Open-Meteo maintains a historical-weather API that allows non-commercial use of the historical weather data the service hosts.
This file builds on _v001 and _v002 to run exploratory analysis on some historical weather data.
The exploration process uses tidyverse, ranger, several generic custom functions, and several functions specific to Open Meteo processing. First, tidyverse, ranger, and the generic functions are loaded:
library(tidyverse) # tidyverse functionality is included throughout
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ranger) # predict() does not work on ranger objects unless ranger has been called
## Warning: package 'ranger' was built under R version 4.2.3
source("./Generic_Added_Utility_Functions_202105_v001.R") # Basic functions
Next, specific functions written in _v001 are copied:
# Helper function for reading a partial CSV file
partialCSVRead <- function(loc, firstRow=1L, lastRow=+Inf, col_names=TRUE, ...) {
  # FUNCTION arguments
  # loc: file location
  # firstRow: first row that is relevant to the partial file read (whether header line or data line)
  # lastRow: last row that is relevant to the partial file read (+Inf means read until last line of file)
  # col_names: the col_names parameter passed to readr::read_csv
  #   TRUE means header=TRUE (get column names from file, read data starting on next line)
  #   FALSE means header=FALSE (auto-generate column names, read data starting on first line)
  #   character vector means use these as column names (read data starting on first line)
  # ...: additional arguments passed to read_csv
  # Read the file and return
  # skip: rows to be skipped are all those prior to firstRow
  # n_max: lastRow-firstRow rows when col_names is TRUE (the header consumes one line);
  #   one more otherwise, since every line in the range is then a data line
  # NOTE: scalar if/else used instead of ifelse() -- col_names is scalar here and
  # ifelse() is a vectorized function that can strip attributes
  readr::read_csv(loc,
                  col_names=col_names,
                  skip=firstRow-1,
                  n_max=lastRow-firstRow+(if(isTRUE(col_names)) 0 else 1),
                  ...
  )
}
# Get the break points for gaps in a vector (e.g., 0, 3, 5:8, 20 has break points 0, 3, 5, 20 and 0, 3, 8, 20)
vecGaps <- function(x, addElements=c(), sortUnique=TRUE) {
  # Optionally prepend fixed sentinel elements, then sort and deduplicate
  if(length(addElements)>0) x <- c(addElements, x)
  if(isTRUE(sortUnique)) x <- unique(sort(x))
  # A run starts where there is no predecessor or the predecessor is more than
  # 1 away; it ends where there is no successor or the successor is more than
  # 1 away. +Inf is appended to starts so starts[i+1]-1 brackets run i.
  runStarts <- is.na(lag(x)) | x-lag(x)>1
  runEnds <- is.na(lead(x)) | lead(x)-x>1
  list("starts"=c(x[runStarts], +Inf),
       "ends"=x[runEnds]
  )
}
# Find the break points in a single file
flatFileGaps <- function(loc) {
  # Blank (zero-length) lines delimit the sub-tables; 0 is added as a sentinel
  # so the first sub-table is bracketed correctly by vecGaps()
  blankLineNums <- which(stringr::str_length(readLines(loc))==0)
  vecGaps(blankLineNums, addElements=0)
}
# Read all relevant data as CSV with header
readMultiCSV <- function(loc, col_names=TRUE, ...) {
  # Each chunk of the flat file sits between a blank-line "end" and the next
  # blank-line "start"; read every chunk as its own CSV and return a list
  gaps <- flatFileGaps(loc)
  lapply(seq_along(gaps$ends),
         FUN=function(idx) {
           partialCSVRead(loc,
                          firstRow=gaps$ends[idx]+1,
                          lastRow=gaps$starts[idx+1]-1,
                          col_names=col_names,
                          ...
           )
         }
  )
}
# Create URL with specified parameters for downloading data from Open Meteo
openMeteoURLCreate <- function(mainURL="https://archive-api.open-meteo.com/v1/archive",
                               lat=45,
                               lon=-90,
                               startDate=paste(year(Sys.Date())-1, "01", "01", sep="-"),
                               endDate=paste(year(Sys.Date())-1, "12", "31", sep="-"),
                               hourlyMetrics=NULL,
                               dailyMetrics=NULL,
                               tz="GMT",
                               ...
) {
  # FUNCTION arguments
  # mainURL: base endpoint for the Open-Meteo archive API
  # lat, lon: coordinates for the request
  # startDate, endDate: YYYY-MM-DD strings (defaults cover all of last year)
  # hourlyMetrics, dailyMetrics: comma-separated metric strings (NULL means omit)
  # tz: timezone string; "/" must be URL-encoded as %2F
  # ...: additional string fragments appended verbatim to the end of the URL
  # Create formatted string
  fString <- paste0(mainURL,
                    "?latitude=",
                    lat,
                    "&longitude=",
                    lon,
                    "&start_date=",
                    startDate,
                    "&end_date=",
                    endDate
  )
  if(!is.null(hourlyMetrics)) fString <- paste0(fString, "&hourly=", hourlyMetrics)
  if(!is.null(dailyMetrics)) fString <- paste0(fString, "&daily=", dailyMetrics)
  # Return the formatted string
  # BUG FIX: str_replace() only replaced the FIRST "/"; timezones such as
  # "America/Argentina/Buenos_Aires" contain two. gsub() replaces all.
  paste0(fString, "&timezone=", gsub("/", "%2F", tz, fixed=TRUE), ...)
}
# Helper function to simplify entry of parameters for Open Meteo download requests
helperOpenMeteoURL <- function(cityName=NULL,
                               lat=NULL,
                               lon=NULL,
                               hourlyMetrics=NULL,
                               hourlyIndices=NULL,
                               hourlyDesc=tblMetricsHourly,
                               dailyMetrics=NULL,
                               dailyIndices=NULL,
                               dailyDesc=tblMetricsDaily,
                               startDate=NULL,
                               endDate=NULL,
                               tz=NULL,
                               ...
) {
  # FUNCTION arguments
  # cityName: city looked up in maps::us.cities when lat/lon are not given
  # lat, lon: coordinates (NULL means derive from cityName)
  # hourlyMetrics/dailyMetrics: comma-separated metric strings (NULL means build from indices)
  # hourlyIndices/dailyIndices: row indices into hourlyDesc/dailyDesc
  # hourlyDesc/dailyDesc: metric description tibbles
  #   (defaults reference globals tblMetricsHourly/tblMetricsDaily, looked up lazily)
  # startDate, endDate, tz: NULL means use the openMeteoURLCreate() defaults
  # ...: passed through to openMeteoURLCreate()
  # Convert city to lat/lon if lat/lon are NULL
  # FIX: scalar conditions use ||/&& (short-circuit) rather than vectorized |/&
  if(is.null(lat) || is.null(lon)) {
    if(is.null(cityName)) stop("\nMust provide lat/lon or city name available in maps::us.cities\n")
    cityData <- maps::us.cities %>% tibble::as_tibble() %>% filter(name==cityName)
    if(nrow(cityData)!=1) stop("\nMust provide city name that maps uniquely to maps::us.cities$name\n")
    lat <- cityData$lat[1]
    lon <- cityData$long[1]
  }
  # Get hourly metrics by index if relevant
  if(is.null(hourlyMetrics) && !is.null(hourlyIndices)) {
    hourlyMetrics <- hourlyDesc %>% slice(hourlyIndices) %>% pull(metric)
    hourlyMetrics <- paste0(hourlyMetrics, collapse=",")
    cat("\nHourly metrics created from indices:", hourlyMetrics, "\n\n")
  }
  # Get daily metrics by index if relevant
  if(is.null(dailyMetrics) && !is.null(dailyIndices)) {
    dailyMetrics <- dailyDesc %>% slice(dailyIndices) %>% pull(metric)
    dailyMetrics <- paste0(dailyMetrics, collapse=",")
    cat("\nDaily metrics created from indices:", dailyMetrics, "\n\n")
  }
  # Use default values from openMeteoURLCreate() for startDate, endDate, and tz if passed as NULL
  if(is.null(startDate)) startDate <- eval(formals(openMeteoURLCreate)$startDate)
  if(is.null(endDate)) endDate <- eval(formals(openMeteoURLCreate)$endDate)
  if(is.null(tz)) tz <- eval(formals(openMeteoURLCreate)$tz)
  # Create and return URL
  openMeteoURLCreate(lat=lat,
                     lon=lon,
                     startDate=startDate,
                     endDate=endDate,
                     hourlyMetrics=hourlyMetrics,
                     dailyMetrics=dailyMetrics,
                     tz=tz,
                     ...
  )
}
# Read JSON data returned from Open Meteo
readOpenMeteoJSON <- function(js, mapDaily=tblMetricsDaily, mapHourly=tblMetricsHourly) {
  # FUNCTION arguments:
  # js: JSON (path/URL/string) returned by download from Open-Meteo
  # mapDaily: mapping file for daily metrics (default is the global tblMetricsDaily)
  # mapHourly: mapping file for hourly metrics (default is the global tblMetricsHourly)
  # Parse the payload and report its top-level element names
  jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
  nms <- jsObj %>% names()
  cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
  # Default every output to NULL; filled below only when present in the payload
  tblDaily <- NULL
  tblHourly <- NULL
  tblUnitsDaily <- NULL
  tblUnitsHourly <- NULL
  # Get daily and hourly as tibble if relevant
  if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble() %>% omProcessDaily()
  if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble() %>% omProcessHourly()
  # Helper: long-format unit descriptions joined to the metric mapper
  helperMetricUnit <- function(x, mapper, desc=NULL) {
    # Derive desc from the caller's expression, e.g. jsObj$daily_units -> "daily_units"
    if(is.null(desc))
      desc <- as.list(match.call())$x %>%
        deparse() %>%
        stringr::str_replace_all(pattern=".*\\$", replacement="")
    x %>%
      tibble::as_tibble() %>%
      pivot_longer(cols=everything()) %>%
      left_join(mapper, by=c("name"="metric")) %>%
      mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>%  # degree sign -> "deg "
      mutate(metricType=desc) %>%
      select(metricType, everything())
  }
  # Get the unit descriptions
  if("daily_units" %in% nms) tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, mapDaily)
  if("hourly_units" %in% nms) tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, mapHourly)
  # Combine the unit tables (hourly first, matching the original ordering).
  # bind_rows() silently drops NULL inputs, so one expression replaces the
  # original if/else-if chain; scalar test uses && rather than vectorized &.
  tblUnits <- if(is.null(tblUnitsDaily) && is.null(tblUnitsHourly)) NULL
  else bind_rows(tblUnitsHourly, tblUnitsDaily)
  # Everything that is not data or units is treated as metadata
  tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
    tibble::as_tibble()
  # Return the list objects
  list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
}
# Return Open meteo metadata in prettified format
prettyOpenMeteoMeta <- function(df, extr="tblDescription") {
  # FUNCTION arguments
  # df: a one-row metadata tibble, or a list containing one (e.g. readOpenMeteoJSON() output)
  # extr: list element to extract when df is a list
  # Prefer inherits() over "%in% class()" for class testing
  if(inherits(df, "list")) df <- df[[extr]]
  # Print each metadata field as "name: value" on its own line
  for(name in names(df)) {
    cat("\n", name, ": ", df %>% pull(name), sep="")
  }
  cat("\n\n")
}
# Process Open Meteo daily data
omProcessDaily <- function(tbl, extr="tblDaily") {
  # FUNCTION arguments
  # tbl: tibble of daily data with a "time" column, or a list containing one
  # extr: list element to extract when tbl is a list
  # Prefer inherits() over "%in% class()" for class testing
  if(inherits(tbl, "list")) tbl <- tbl[[extr]]
  # Parse the "time" string as a Date and move it to the front
  tbl %>% mutate(date=lubridate::ymd(time)) %>% select(date, everything())
}
# Process Open meteo hourly data
omProcessHourly <- function(tbl, extr="tblHourly") {
  # FUNCTION arguments
  # tbl: tibble of hourly data with a "time" column, or a list containing one
  # extr: list element to extract when tbl is a list
  # Prefer inherits() over "%in% class()" for class testing
  if(inherits(tbl, "list")) tbl <- tbl[[extr]]
  # Keep the raw timestamp string, parse it, and derive date/hour columns
  tbl %>%
    mutate(origTime=time,
           time=lubridate::ymd_hm(time),
           date=lubridate::date(time),
           hour=lubridate::hour(time)
    ) %>%
    select(time, date, hour, everything())
}
# Simple predictive model for categorical variable
simpleOneVarPredict <- function(df,
                                tgt,
                                prd,
                                dfTest=NULL,
                                nPrint=30,
                                showPlot=TRUE,
                                returnData=TRUE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training data set)
  # tgt: target variable (character column name)
  # prd: predictor variable (character column name)
  # dfTest: test dataset for applying predictions (NULL means skip the test step)
  # nPrint: maximum number of lines of confusion matrix to print
  # 0 means do not print any summary statistics
  # showPlot: boolean, should overlap plot be created and shown?
  # returnData: boolean, should the data frames created here be returned as a list?
  # Counts of predictor-by-target; the modal (most frequent) target within each
  # predictor bucket becomes the prediction for that bucket
  dfPred <- df %>%
    group_by(across(all_of(c(prd, tgt)))) %>%
    summarize(n=n(), .groups="drop") %>%
    arrange(across(all_of(prd)), desc(n)) %>%
    group_by(across(all_of(prd))) %>%
    mutate(correct=row_number()==1, predicted=first(get(tgt))) %>%
    ungroup()
  # Confusion matrix and accuracy by target category (training data)
  # NOTE(review): pivot_wider only creates TRUE/FALSE columns for values present;
  # a data set with no incorrect (or no correct) rows would error here -- confirm
  dfConf <- dfPred %>%
    group_by(across(all_of(c(tgt, "correct")))) %>%
    summarize(n=sum(n), .groups="drop") %>%
    pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
    mutate(n=`TRUE`+`FALSE`,
           pctCorrect=`TRUE`/n,
           pctNaive=1/(nrow(.)),  # naive accuracy: 1 / number of target categories
           lift=pctCorrect/pctNaive-1
    )
  # Overall confusion summary (training data)
  dfConfAll <- dfConf %>%
    summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
    mutate(pctCorrect=`TRUE`/n,
           pctNaive=nMax/n,
           lift=pctCorrect/pctNaive-1,
           nBucket=length(unique(dfPred[[prd]]))
    )
  # Print confusion matrices
  if(nPrint > 0) {
    cat("\nAccuracy by target subgroup (training data):\n")
    dfConf %>% print(n=nPrint)
    cat("\nOverall Accuracy (training data):\n")
    dfConfAll %>% print(n=nPrint)
  }
  # Plot of overlaps
  if(isTRUE(showPlot)) {
    p1 <- dfPred %>%
      group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      ggplot(aes(x=get(tgt), y=predicted)) +
      labs(x="Actual",
           y="Predicted",
           title=paste0("Training data - Actual vs. predicted ", tgt),
           subtitle=paste0("(using ", prd, ")")
      ) +
      geom_text(aes(label=n)) +
      geom_tile(aes(fill=correct), alpha=0.25)
    print(p1)
  }
  # Create metrics for test dataset if requested
  if(!is.null(dfTest)) {
    # Most common predicted category overall; fallback for unseen predictor levels
    # BUG FIX: sort=TRUE so slice(1) takes the largest weighted count rather than
    # the first level in sort order
    mostPredicted <- count(dfPred, predicted, wt=n, sort=TRUE) %>% slice(1) %>% pull(predicted)
    # Get mapping of metric to prediction
    dfPredict <- dfPred %>%
      group_by(across(all_of(c(prd, "predicted")))) %>%
      summarize(n=sum(n), .groups="drop")
    # Create predictions for test data
    # FIX: explicit by= keeps the join on prd only and silences the join message
    dfPredTest <- dfTest %>%
      select(all_of(c(prd, tgt))) %>%
      left_join(select(dfPredict, -n), by=prd) %>%
      replace_na(list(predicted=mostPredicted)) %>%
      group_by(across(all_of(c(prd, tgt, "predicted")))) %>%
      summarize(n=n(), .groups="drop") %>%
      mutate(correct=(get(tgt)==predicted))
    # Create confusion statistics for test data
    dfConfTest <- dfPredTest %>%
      group_by(across(all_of(c(tgt, "correct")))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      pivot_wider(id_cols=tgt, names_from=correct, values_from=n, values_fill=0) %>%
      mutate(n=`TRUE`+`FALSE`,
             pctCorrect=`TRUE`/n,
             pctNaive=1/(nrow(.)),
             lift=pctCorrect/pctNaive-1
      )
    # Overall confusion matrix for test data
    # BUG FIX: bucket count must come from dfPredTest (which carries prd);
    # dfConfTest has no prd column, so the original always reported nBucket=0
    dfConfAllTest <- dfConfTest %>%
      summarize(nMax=max(n), across(c(`FALSE`, `TRUE`, "n"), sum)) %>%
      mutate(pctCorrect=`TRUE`/n,
             pctNaive=nMax/n,
             lift=pctCorrect/pctNaive-1,
             nBucket=length(unique(dfPredTest[[prd]]))
      )
    # Print confusion matrices
    if(nPrint > 0) {
      cat("\nAccuracy by target subgroup (testing data):\n")
      dfConfTest %>% print(n=nPrint)
      cat("\nOverall Accuracy (testing data):\n")
      dfConfAllTest %>% print(n=nPrint)
    }
  } else {
    dfPredTest <- NULL
    dfConfTest <- NULL
    dfConfAllTest <- NULL
  }
  # Return data if requested
  if(isTRUE(returnData)) list(dfPred=dfPred,
                              dfConf=dfConf,
                              dfConfAll=dfConfAll,
                              dfPredTest=dfPredTest,
                              dfConfTest=dfConfTest,
                              dfConfAllTest=dfConfAllTest
  )
}
# Fit a single predictor to a single categorical variable
simpleOneVarFit <- function(df,
                            tgt,
                            prd,
                            rankType="last",
                            naMethod=TRUE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training data set)
  # tgt: target variable
  # prd: predictor variable
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # Tally every predictor-target combination, largest counts first within bucket
  dfCounts <- df %>%
    group_by(across(all_of(c(prd, tgt)))) %>%
    summarize(n=n(), .groups="drop") %>%
    arrange(across(all_of(prd)), desc(n), across(all_of(tgt)))
  # Within each predictor bucket, rank combinations from most to least frequent;
  # rankN == 1 marks the modal target for the bucket
  dfCounts %>%
    group_by(across(all_of(prd))) %>%
    mutate(rankN=n()+1-rank(n, ties.method=rankType, na.last=naMethod)) %>%
    arrange(across(all_of(prd)), rankN) %>%
    ungroup()
}
# Create categorical predictions mapper
simpleOneVarMapper <- function(df, tgt, prd) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble from simpleOneVarFit()
  # tgt: target variable
  # prd: predictor variable
  # Predictions: the top-ranked (first) target within each predictor bucket
  dfPredictor <- df %>%
    group_by(across(all_of(prd))) %>%
    filter(row_number()==1) %>%
    select(all_of(c(prd, tgt))) %>%
    ungroup()
  # Fallback table: actual targets ordered by overall weighted frequency
  dfCommon <- df %>% count(across(all_of(tgt)), wt=n, sort=TRUE)
  list(dfPredictor=dfPredictor, dfCommon=dfCommon)
}
# Map the categorical predictions to unseen data
simpleOneVarApplyMapper <- function(df,
                                    tgt,
                                    prd,
                                    mapper,
                                    mapperDF="dfPredictor",
                                    mapperDefault="dfCommon",
                                    prdName="predicted"
) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing prd for predicting tgt
  # tgt: target variable in df
  # prd: predictor variable in df
  # mapper: mapping list from simpleOneVarMapper()
  # mapperDF: element that can be used to merge mappings
  # mapperDefault: element that can be used for NA resulting from merging mapperDF
  # prdName: name for the prediction variable
  # Extract the mapper (renaming tgt to prdName) and the fallback value
  vecRename <- c(prdName) %>% purrr::set_names(tgt)
  dfMap <- mapper[[mapperDF]] %>% select(all_of(c(prd, tgt))) %>% colRenamer(vecRename=vecRename)
  chrDefault <- mapper[[mapperDefault]] %>% slice(1) %>% pull(tgt)
  # Merge mappings to df; unseen predictor levels fall back to chrDefault
  # BUG FIX: fill NA in the column actually named prdName -- the original
  # hard-coded "predicted", so a non-default prdName was never filled
  df %>%
    left_join(dfMap, by=prd) %>%
    replace_na(purrr::set_names(list(chrDefault), prdName))
}
# Create confusion matrix data for categorical predictions
simpleOneVarConfusionData <- function(df,
                                      tgtOrig,
                                      tgtPred,
                                      otherVars=c(),
                                      weightBy="n"
) {
  # FUNCTION ARGUMENTS:
  # df: data frame from simpleOneVarApplyMapper()
  # tgtOrig: original target variable name in df
  # tgtPred: predicted target variable name in df
  # otherVars: other variables to be kept (will be grouping variables)
  # weightBy: weighting variable for counts in df (NULL means count each row of df as 1)
  # Group once on actual, predicted, and any extra variables
  grouped <- df %>%
    group_by(across(all_of(c(tgtOrig, tgtPred, otherVars))))
  # Tally either by summing the weight column or by counting rows
  tallied <- if(is.null(weightBy)) {
    grouped %>% summarize(n=n(), .groups="drop")
  } else {
    grouped %>% summarize(n=sum(get(weightBy)), .groups="drop")
  }
  # Flag cells on the diagonal (actual == predicted)
  tallied %>% mutate(correct=get(tgtOrig)==get(tgtPred))
}
# Print and plot confusion matrix for categorical predictions
simpleOneVarConfusionReport <- function(df,
                                        tgtOrig,
                                        tgtPred,
                                        otherVars=c(),
                                        printConf=TRUE,
                                        printConfOrig=printConf,
                                        printConfPred=printConf,
                                        printConfOverall=printConf,
                                        plotConf=TRUE,
                                        plotDesc="",
                                        nBucket=NA,
                                        predictorVarName="",
                                        returnData=FALSE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame from simpleOneVarConfusionData()
  # tgtOrig: original target variable name in df
  # tgtPred: predicted target variable name in df
  # otherVars: other variables to be kept (will be grouping variables) - NOT IMPLEMENTED
  # printConf: boolean, should confusion matrix data be printed? Applies to all three
  # printConfOrig: boolean, should confusion data be printed based on original target variable?
  # printConfPred: boolean, should confusion data be printed based on predicted target variable?
  # printConfOverall: boolean, should overall confusion data be printed?
  # plotConf: boolean, should confusion overlap data be plotted?
  # plotDesc: descriptive label to be included in front of plot title
  # nBucket: number of buckets used for prediction (pass from previous data)
  # predictorVarName: variable name to be included in chart description
  # returnData: boolean, should the confusion matrices be returned?
  # FIX throughout: scalar conditions use || (short-circuit) rather than vectorized |
  # Confusion data based on original target variable
  if(isTRUE(printConfOrig) || isTRUE(returnData)) {
    dfConfOrig <- df %>%
      group_by(across(all_of(c(tgtOrig)))) %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(pctRight=right/n, pctNaive=n/(sum(n)), lift=pctRight/pctNaive-1)
  }
  # Confusion data based on predicted target variable
  if(isTRUE(printConfPred) || isTRUE(returnData)) {
    dfConfPred <- df %>%
      group_by(across(all_of(c(tgtPred)))) %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(pctRight=right/n)
  }
  # Overall confusion data; naive baseline is always predicting the modal target
  if(isTRUE(printConfOverall) || isTRUE(returnData)) {
    maxNaive <- df %>%
      group_by(across(all_of(tgtOrig))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      arrange(desc(n)) %>%
      slice(1) %>%
      pull(n)
    dfConfOverall <- df %>%
      summarize(right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop") %>%
      mutate(maxN=maxNaive, pctRight=right/n, pctNaive=maxN/n, lift=pctRight/pctNaive-1, nBucket=nBucket)
  }
  # Confusion report based on original target variable
  if(isTRUE(printConfOrig)) {
    cat("\nConfusion data based on original target variable:", tgtOrig, "\n")
    dfConfOrig %>%
      print(n=50)
  }
  # Confusion report based on predicted target variable
  if(isTRUE(printConfPred)) {
    cat("\nConfusion data based on predicted target variable:", tgtPred, "\n")
    dfConfPred %>%
      print(n=50)
  }
  # Overall confusion matrix
  if(isTRUE(printConfOverall)) {
    cat("\nOverall confusion matrix\n")
    dfConfOverall %>%
      print(n=50)
  }
  # Plot of overlaps
  if(isTRUE(plotConf)) {
    p1 <- df %>%
      group_by(across(all_of(c(tgtOrig, tgtPred, "correct")))) %>%
      summarize(n=sum(n), .groups="drop") %>%
      ggplot(aes(x=get(tgtOrig), y=get(tgtPred))) +
      labs(x="Actual",
           y="Predicted",
           title=paste0(plotDesc, "Actual vs. predicted ", tgtOrig),
           subtitle=paste0("(using ", predictorVarName, ")")
      ) +
      geom_text(aes(label=n)) +
      geom_tile(aes(fill=correct), alpha=0.25)
    print(p1)
  }
  # Return data if requested
  if(isTRUE(returnData)) list(dfConfOrig=dfConfOrig, dfConfPred=dfConfPred, dfConfOverall=dfConfOverall)
}
# Process for chaining predictor, applier, and confusion matrix for categorical variables
simpleOneVarChain <- function(df,
                              tgt,
                              prd,
                              mapper=NULL,
                              rankType="last",
                              naMethod=TRUE,
                              printReport=TRUE,
                              plotDesc="",
                              returnData=TRUE,
                              includeConfData=FALSE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame or tibble with key elements (training or testing data set)
  # tgt: target variable
  # prd: predictor variable
  # mapper: mapping file to be applied for predictions (NULL means create from simpleOneVarMapper())
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # printReport: boolean, should the confusion report data and plot be printed?
  # plotDesc: descriptive label to be included in front of plot title
  # returnData: boolean, should data elements be returned?
  # includeConfData: boolean, should confusion data be returned?
  # Create the summary of predictor-target-n
  dfFit <- simpleOneVarFit(df, tgt=tgt, prd=prd, rankType=rankType, naMethod=naMethod)
  # Create the mapper if it does not already exist
  if(is.null(mapper)) mapper <- simpleOneVarMapper(dfFit, tgt=tgt, prd=prd)
  # Apply mapper to data
  dfApplied <- simpleOneVarApplyMapper(dfFit, tgt=tgt, prd=prd, mapper=mapper)
  # Create confusion data
  dfConfusion <- simpleOneVarConfusionData(dfApplied, tgtOrig=tgt, tgtPred="predicted")
  # Create confusion report if requested
  # FIX: scalar condition uses || (short-circuit) rather than vectorized |
  if(isTRUE(printReport) || isTRUE(includeConfData)) {
    dfConfReport <- simpleOneVarConfusionReport(df=dfConfusion,
                                                tgtOrig=tgt,
                                                tgtPred="predicted",
                                                nBucket=length(unique(dfApplied[[prd]])),
                                                predictorVarName=prd,
                                                printConf=printReport,
                                                plotConf=printReport,
                                                plotDesc=plotDesc,
                                                returnData=includeConfData
    )
  }
  # Return data if requested
  if(isTRUE(returnData)) {
    ret <- list(dfFit=dfFit, mapper=mapper, dfApplied=dfApplied, dfConfusion=dfConfusion)
    if(isTRUE(includeConfData)) ret <- c(ret, list(dfConfData=dfConfReport))
    ret
  }
}
# Adds a train-test component for single variable predictions
simpleOneVarTrainTest <- function(dfTrain,
                                  dfTest,
                                  tgt,
                                  prd,
                                  rankType="last",
                                  naMethod=TRUE,
                                  printReport=FALSE,
                                  includeConfData=TRUE,
                                  returnData=TRUE
) {
  # FUNCTION ARGUMENTS:
  # dfTrain: data frame or tibble with key elements (training data set)
  # dfTest: data frame or tibble with key elements (testing data set)
  # tgt: target variable
  # prd: predictor variable
  # rankType: method for breaking ties of same n, passed to base::rank as ties.method=
  # naMethod: method for handling NA in ranks, passed to base::rank as na.last=
  # printReport: boolean, should the confusion report data and plot be printed?
  # includeConfData: boolean, should confusion data be returned?
  # returnData: boolean, should data elements be returned?
  # Fit on training data (mapper is created here)
  chainTrain <- simpleOneVarChain(df=dfTrain,
                                  tgt=tgt,
                                  prd=prd,
                                  rankType=rankType,
                                  naMethod=naMethod,
                                  printReport=printReport,
                                  plotDesc="Training data: ",
                                  returnData=TRUE,
                                  includeConfData=includeConfData
  )
  # Score testing data by reusing the training mapper
  chainTest <- simpleOneVarChain(df=dfTest,
                                 tgt=tgt,
                                 prd=prd,
                                 mapper=chainTrain$mapper,
                                 rankType=rankType,
                                 naMethod=naMethod,
                                 printReport=printReport,
                                 plotDesc="Testing data: ",
                                 returnData=TRUE,
                                 includeConfData=includeConfData
  )
  # Return both chains under the interface's element names
  if(isTRUE(returnData)) list(tmpTrain=chainTrain, tmpTest=chainTest)
}
# Plot the means by cluster and variable for a k-means object
plotClusterMeans <- function(km, nrow=NULL, ncol=NULL, scales="fixed") {
# FUNCTION ARGUMENTS
# km: object returned by stats::kmeans(...)
# nrow: number of rows for faceting (NULL means default)
# ncol: number of columns for faceting (NULL means default)
# scales: passed to facet_wrap as scales=scales
# NOTE(review): the nrow/ncol arguments shadow base::nrow/base::ncol; the call
# nrow(km$centers) below still resolves to base::nrow because R only considers
# function objects in call position -- works, but rename locals if refactoring
# Assess clustering by dimension
# km$centers is a matrix (clusters x variables); pivot to long form so each
# point is one (cluster, variable, mean) triple
p1 <- km$centers %>%
tibble::as_tibble() %>%
mutate(cluster=row_number()) %>%
pivot_longer(cols=-c(cluster)) %>%
# Order variables by spread of their cluster means (a[2]-a[1] for exactly two
# clusters, signed; range otherwise) so the most discriminating appear together
ggplot(aes(x=fct_reorder(name,
value,
.fun=function(a) ifelse(length(a)==2, a[2]-a[1], diff(range(a)))
),
y=value
)
) +
geom_point(aes(color=factor(cluster))) +
scale_color_discrete("Cluster") +
facet_wrap(~factor(cluster), nrow=nrow, ncol=ncol, scales=scales) +
labs(title=paste0("Cluster means (kmeans, centers=", nrow(km$centers), ")"),
x="Metric",
y="Cluster mean"
) +
# Dashed reference at the median of all center values
geom_hline(yintercept=median(km$centers), lty=2) +
coord_flip()
print(p1)
}
# Plot percentage by cluster
plotClusterPct <- function(df, km, keyVars, nRowFacet=1, printPlot=TRUE) {
  # FUNCTION ARGUMENTS:
  # df: data frame initially passed to stats::kmeans(...)
  # km: object returned by stats::kmeans(...)
  # keyVars: character vector of length 1 (y-only, x will be cl) or length 2 (x, y, cl will facet)
  # nRowFacet: number of rows for facetting (only relevant if length(keyVars) is 2)
  # printPlot: boolean, should plot be printed? (if not true, plot will be returned)
  # Validate keyVars up front
  nKey <- length(keyVars)
  if(!(nKey %in% c(1, 2))) stop("\nArgument keyVars must be length-1 or length-2\n")
  # Share of observations falling in each cluster within each keyVars combination
  dfShares <- df %>%
    mutate(cl=factor(km$cluster)) %>%
    group_by(across(c(all_of(keyVars), "cl"))) %>%
    summarize(n=n(), .groups="drop") %>%
    group_by(across(all_of(keyVars))) %>%
    mutate(pct=n/sum(n)) %>%
    ungroup()
  # Base plot: fill scale and labels shared by both layouts
  basePlot <- dfShares %>%
    ggplot() +
    scale_fill_continuous(low="white", high="green") +
    labs(title=paste0("Percentage by cluster (kmeans with ", nrow(km$centers), " centers)"),
         x=ifelse(nKey==1, "Cluster", keyVars[1]),
         y=ifelse(nKey==1, keyVars[1], keyVars[2])
    )
  # Length-1: cluster on x; length-2: keyVars on x/y, one facet per cluster
  if(nKey==1) {
    basePlot <- basePlot + geom_tile(aes(fill=pct, x=cl, y=get(keyVars[1])))
  } else {
    basePlot <- basePlot +
      geom_tile(aes(fill=pct, x=get(keyVars[1]), y=get(keyVars[2]))) +
      facet_wrap(~cl, nrow=nRowFacet)
  }
  if(isTRUE(printPlot)) print(basePlot)
  else return(basePlot)
}
# Run k-means (or use passed k-means object) and plot centers and percentages of observations
runKMeans <- function(df,
                      km=NULL,
                      vars=NULL,
                      centers=2,
                      nStart=1L,
                      iter.max=10L,
                      seed=NULL,
                      plotMeans=FALSE,
                      nrowMeans=NULL,
                      plotPct=NULL,
                      nrowPct=1,
                      returnKM=is.null(km)
) {
  # FUNCTION ARGUMENTS:
  # df: data frame for clustering
  # km: k-means object (will shut off k-means processing and run as plot-only)
  # vars: variables to be used for clustering (NULL means everything in df)
  # centers: number of centers
  # nStart: passed to kmeans
  # iter.max: passed to kmeans
  # seed: seed to be set (if NULL, no seed is set)
  # plotMeans: boolean, plot variable means by cluster?
  # nrowMeans: argument passed as nrow for faceting rows in plotClusterMeans() - NULL is default ggplot2
  # plotPct: list of character vectors to be passed sequentially as keyVars to plotClusterPct()
  #   NULL means do not run
  #   plotPct=list(c("var1"), c("var2", "var3")) will run plotting twice
  # nrowPct: argument for faceting number of rows in plotClusterPct()
  # returnKM: boolean, should the k-means object be returned?
  # Set seed if requested
  if(!is.null(seed)) set.seed(seed)
  # Get the variable names if passed as NULL
  if(is.null(vars)) vars <- names(df)
  # Run the k-means process if the object has not been passed
  if(is.null(km)) {
    km <- df %>%
      select(all_of(vars)) %>%
      kmeans(centers=centers, iter.max=iter.max, nstart=nStart)
  }
  # Assess clustering by dimension if requested
  if(isTRUE(plotMeans)) plotClusterMeans(km, nrow=nrowMeans)
  # FIX: seq_along() instead of 1:length() -- the original 1:length(plotPct)
  # evaluated to 1:0 for an empty list and then errored on plotPct[[1]]
  if(!is.null(plotPct))
    for(ctr in seq_along(plotPct))
      plotClusterPct(df=df, km=km, keyVars=plotPct[[ctr]], nRowFacet=nrowPct)
  # Return the k-means object
  if(isTRUE(returnKM)) return(km)
}
# Assign points to closest center of a passed k-means object
assignKMeans <- function(km, df, returnAllDistanceData=FALSE) {
  # FUNCTION ARGUMENTS:
  # km: a k-means object
  # df: data frame or tibble
  # returnAllDistanceData: boolean, should the distance data and clusters be returned?
  # TRUE returns a data frame with distances as V1, V2, ..., and cluster as cl
  # FALSE returns a vector of cluster assignments as integers
  # Select columns from df to match km (also aligns column order)
  df <- df %>% select(all_of(colnames(km$centers)))
  # BUG FIX: all.equal() returns a character vector (not FALSE) on mismatch, so
  # the original !all.equal(...) raised a type error instead of reaching stop();
  # wrapping in isTRUE() makes the guard behave as intended
  if(!isTRUE(all.equal(names(df), colnames(km$centers)))) stop("\nName mismatch in clustering and frame\n")
  # Euclidean distance from every observation to each center (one column per
  # center); the assigned cluster is the per-row arg min
  distClust <- sapply(seq_len(nrow(km$centers)),
                      FUN=function(x) sqrt(rowSums(sweep(as.matrix(df),
                                                         2,
                                                         t(as.matrix(km$centers[x,,drop=FALSE]))
                      )^2
                      )
                      )
  ) %>%
    as.data.frame() %>%
    tibble::as_tibble() %>%
    mutate(cl=apply(., 1, which.min))
  # Return the proper file
  if(isTRUE(returnAllDistanceData)) return(distClust)
  else return(distClust$cl)
}
As well, specific functions from _v002 are copied:
runSimpleRF <- function(df, yVar, xVars=NULL, ...) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing observations
  # yVar: variable to be predicted (numeric for regression, categorical for classification)
  # xVars: predictor variables (NULL means everything in df except for yVar)
  # ...: other arguments passed to ranger::ranger
  # Default predictors: every column other than the target
  if(is.null(xVars)) xVars <- setdiff(names(df), yVar)
  # Build the model formula "y ~ x1+x2+..." and fit on just those columns
  rfFormula <- as.formula(paste0(yVar, "~", paste0(xVars, collapse="+")))
  ranger::ranger(rfFormula,
                 data=df[, c(yVar, xVars)],
                 ...
  )
}
plotRFImportance <- function(rf,
                             impName="variable.importance",
                             divBy=1000,
                             plotTitle=NULL,
                             plotData=TRUE,
                             returnData=!isTRUE(plotData)
) {
  # FUNCTION ARGUMENTS:
  # rf: output list from random forest with an element for importance
  # impName: name of the element to extract from rf
  # divBy: divisor for the importance variable
  # plotTitle: title for plot (NULL means use default)
  # plotData: boolean, should the importance plot be created and printed?
  # returnData: boolean, should the processed data be returned?
  # Default title and y-axis label; label records the divisor unless it is 1
  if(is.null(plotTitle)) plotTitle <- "Importance for simple random forest"
  axisLab <- "Variable Importance"
  if(!isTRUE(all.equal(divBy, 1))) axisLab <- paste0(axisLab, " (", divBy, "s)")
  # Named importance vector -> two-column tibble (metric, imp)
  dfImp <- rf[[impName]] %>%
    as.data.frame() %>%
    purrr::set_names("imp") %>%
    rownames_to_column("metric") %>%
    tibble::as_tibble()
  # Horizontal bar chart, most important variable at the top
  if(isTRUE(plotData)) {
    impPlot <- dfImp %>%
      ggplot(aes(x=fct_reorder(metric, imp), y=imp/divBy)) +
      geom_col(fill="lightblue") +
      labs(x=NULL, y=axisLab, title=plotTitle) +
      coord_flip()
    print(impPlot)
  }
  if(isTRUE(returnData)) return(dfImp)
}
predictRF <- function(rf, df, newCol="pred", predsOnly=FALSE) {
  # FUNCTION ARGUMENTS:
  # rf: a trained random forest model
  # df: data frame for adding predictions
  # newCol: name for new column to be added to df
  # predsOnly: boolean, should only the vector of predictions be returned?
  # if FALSE, a column named newCol is added to df, with df returned
  # Score the data; ranger predict() returns a list with a $predictions element
  preds <- predict(rf, data=df)$predictions
  # Either attach predictions as a new column or return the bare vector
  if(!isTRUE(predsOnly)) {
    df[newCol] <- preds
    return(df)
  }
  preds
}
# Update for continuous variables
reportAccuracy <- function(df,
                           trueCol,
                           predCol="pred",
                           reportAcc=TRUE,
                           rndReport=2,
                           useLabel="requested data",
                           returnAcc=!isTRUE(reportAcc),
                           reportR2=FALSE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing actual and predictions
  # trueCol: column containing true value
  # predCol: column containing predicted value
  # reportAcc: boolean, should accuracy be reported (printed to output)?
  # rndReport: number of significant digits for reporting (will be converted to percentage first)
  # useLabel: label for data to be used in reporting
  # returnAcc: boolean, should the accuracy be returned
  # return value is not converted to percentage, not rounded
  # reportR2: boolean, should accuracy be calculated as R-squared?
  # (default FALSE measures as categorical)
  # FIX: extract columns with [[ ]] -- clearer and safer than the original
  # pull(get(trueCol)) data-masking hack and data-frame comparison
  if(isTRUE(reportR2)) {
    # Continuous: R-squared of predictions vs. the null (mean-only) model
    tc <- df[[trueCol]]
    pc <- df[[predCol]]
    mseNull <- mean((tc-mean(tc))^2)
    msePred <- mean((tc-pc)^2)
    r2 <- 1 - msePred/mseNull
    if(isTRUE(reportAcc))
      cat("\nR-squared of ",
          useLabel,
          " is: ",
          round(100*r2, rndReport),
          "% (RMSE ",
          round(sqrt(msePred), 2),
          " vs. ",
          round(sqrt(mseNull), 2),
          " null)\n",
          sep=""
      )
    acc <- c("mseNull"=mseNull, "msePred"=msePred, "r2"=r2)
  } else {
    # Categorical: share of exact matches
    acc <- mean(df[[trueCol]]==df[[predCol]])
    if(isTRUE(reportAcc))
      cat("\nAccuracy of ", useLabel, " is: ", round(100*acc, rndReport), "%\n", sep="")
  }
  # Return accuracy statistic if requested
  if(isTRUE(returnAcc)) return(acc)
}
# Update for automated rounding
plotConfusion <- function(df,
                          trueCol,
                          predCol="pred",
                          useTitle=NULL,
                          useSub=NULL,
                          plotCont=FALSE,
                          rndTo=NULL,
                          rndBucketsAuto=100,
                          nSig=NULL,
                          refXY=FALSE
) {
  # FUNCTION ARGUMENTS:
  # df: data frame containing actual and predictions
  # trueCol: column containing true value
  # predCol: column containing predicted value
  # useTitle: title to be used for chart (NULL means create from trueCol)
  # useSub: subtitle to be used for chart (NULL means none)
  # plotCont: boolean, should plotting assume continuous variables?
  # (default FALSE assumes confusion plot for categorical variables)
  # rndTo: every number in x should be rounded to the nearest rndTo
  # NULL means no rounding (default); -1L means make an estimate based on data
  # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
  # nSig: number of significant digits for automatically calculated rounding parameter
  # (NULL means calculate exactly)
  # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
  # Default title derives from the target column name
  if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)
  # autoRound() passes values through unchanged when rndTo is NULL and
  # estimates a rounding grain when rndTo is -1L
  df <- df %>%
    mutate(across(all_of(c(trueCol, predCol)),
                  .fns=function(x) autoRound(x, rndTo=rndTo, rndBucketsAuto=rndBucketsAuto, nSig=nSig)
    )
    )
  # Tally actual/predicted pairs once; both layouts build on the same base.
  # x = actual and y = predicted gives a more meaningful geom_smooth() when
  # continuous; the categorical branch flips coordinates below.
  basePlot <- df %>%
    group_by(across(all_of(c(trueCol, predCol)))) %>%
    summarize(n=n(), .groups="drop") %>%
    ggplot(aes(y=get(predCol), x=get(trueCol))) +
    labs(y="Predicted", x="Actual", title=useTitle, subtitle=useSub)
  if(isTRUE(plotCont)) {
    # Continuous: bubble plot with a weighted linear trend
    basePlot <- basePlot +
      geom_point(aes(size=n), alpha=0.5) +
      scale_size_continuous("# Obs") +
      geom_smooth(aes(weight=n), method="lm")
    if(isTRUE(refXY)) basePlot <- basePlot + geom_abline(slope=1, intercept=0, lty=2, color="red")
  } else {
    # Categorical: confusion heat map with cell counts, flipped coordinates
    basePlot <- basePlot +
      geom_tile(aes(fill=n)) +
      geom_text(aes(label=n), size=2.5) +
      coord_flip() +
      scale_fill_continuous("", low="white", high="green")
  }
  # Output plot
  print(basePlot)
}
runFullRF <- function(dfTrain,
                      yVar,
                      xVars,
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL,
                      isContVar=FALSE,
                      rndTo=NULL,
                      rndBucketsAuto=100,
                      nSig=NULL,
                      refXY=FALSE,
                      makePlots=TRUE,
                      plotImp=makePlots,
                      plotConf=makePlots,
                      returnData=FALSE,
                      ...
) {
  # FUNCTION ARGUMENTS:
  # dfTrain: training data
  # yVar: dependent variable
  # xVars: column(s) containing independent variables
  # dfTest: test dataset for applying predictions
  # useLabel: label to be used for reporting accuracy
  # useSub: subtitle to be used for confusion chart (NULL means none)
  # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
  # rndTo: every number in x should be rounded to the nearest rndTo
  # NULL means no rounding (default); -1L means make an estimate based on data
  # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
  # nSig: number of significant digits for automatically calculated rounding parameter
  # (NULL means calculate exactly)
  # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
  # makePlots: boolean, should plots be created for variable importance and confusion matrix?
  # plotImp: boolean, should variable importance be plotted? (default is makePlots)
  # plotConf: boolean, should confusion matrix be plotted? (default is makePlots)
  # returnData: boolean, should data be returned?
  # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()
  # Step 1: fit the forest, tracking impurity-based variable importance
  rfModel <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)
  # Step 2: build the importance table (plotted only when plotImp is TRUE)
  dfImportance <- plotRFImportance(rfModel, plotData=plotImp, returnData=TRUE)
  # Step 3: score the test data
  dfScored <- predictRF(rf=rfModel, df=dfTest)
  # Step 4: accuracy (R-squared when continuous, hit rate when categorical)
  accStats <- reportAccuracy(dfScored,
                             trueCol=yVar,
                             rndReport=3,
                             useLabel=useLabel,
                             reportR2=isTRUE(isContVar),
                             returnAcc=TRUE
  )
  # Step 5: confusion plot (continuous or categorical layout) if requested
  if(isTRUE(plotConf)) {
    plotConfusion(dfScored,
                  trueCol=yVar,
                  useSub=useSub,
                  plotCont=isTRUE(isContVar),
                  rndTo=rndTo,
                  rndBucketsAuto=rndBucketsAuto,
                  nSig=nSig,
                  refXY=refXY
    )
  }
  # Step 6: return fitted objects if requested (element names are the interface)
  if(isTRUE(returnData)) return(list(rf=rfModel, rfImp=dfImportance, tstPred=dfScored, rfAcc=accStats))
}
runPartialImportanceRF <- function(dfTrain,
                                   yVar,
                                   dfTest,
                                   impDB=dfImp,
                                   nImp=+Inf,
                                   otherX=c(),
                                   isContVar=TRUE,
                                   useLabel=keyLabel,
                                   useSub=stringr::str_to_sentence(keyLabel),
                                   rndTo=NULL,
                                   rndBucketsAuto=50,
                                   nSig=NULL,
                                   refXY=FALSE,
                                   makePlots=FALSE,
                                   returnElem=c("rfImp", "rfAcc")
                                   ) {
  # Fit/evaluate a random forest restricted to the top-importance predictors for yVar.
  # FUNCTION ARGUMENTS
  # dfTrain: training data
  # yVar: y variable in dfTrain
  # dfTest: test data
  # impDB: tibble containing variable importance by dependent variable
  # nImp: use the top nImp variables by variable importance (+Inf means all)
  # otherX: include these additional x variables
  # isContVar: boolean, is this a continuous variable (regression)? FALSE means classification
  # useLabel: label for description
  # useSub: label for plot
  # rndTo: rounding parameter for plots, passed to runFullRF
  #        (NULL means no rounding; -1L means estimate from the underlying data)
  # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
  # nSig: number of significant digits for automatically calculated rounding parameter
  #       (NULL means calculate exactly)
  # refXY: reference-line parameter for plots, passed to runFullRF
  # makePlots: boolean, should plots be created?
  # returnElem: character vector of list elements to be returned
  # Pull the top-nImp previously ranked predictors for this dependent variable,
  # then append any explicitly requested extras (duplicates removed)
  topVars <- impDB %>% filter(n<=nImp, src==yVar) %>% pull(metric)
  useX <- unique(c(topVars, otherX))
  # Delegate the full fit/predict/report cycle to runFullRF, then keep only
  # the requested elements of its returned list
  fullRes <- runFullRF(dfTrain=dfTrain,
                       yVar=yVar,
                       xVars=useX,
                       dfTest=dfTest,
                       isContVar=isContVar,
                       useLabel=useLabel,
                       useSub=useSub,
                       rndTo=rndTo,
                       rndBucketsAuto=rndBucketsAuto,
                       nSig=nSig,
                       refXY=refXY,
                       makePlots=makePlots,
                       returnData=TRUE
                       )
  fullRes[returnElem]
}
autoRound <- function(x, rndTo=-1L, rndBucketsAuto=100, nSig=NULL) {
  # Round a numeric vector to the nearest multiple of a fixed or auto-estimated step.
  # FUNCTION ARGUMENTS
  # x: numeric vector to be rounded
  # rndTo: every number in x should be rounded to the nearest rndTo
  #        NULL means no rounding
  #        -1L means make an estimate based on data (default)
  # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
  # nSig: number of significant digits for automatically calculated rounding parameter
  #       (NULL means calculate exactly)
  # RETURNS: x unchanged, or x rounded to the nearest multiple of rndTo
  # If rndTo is passed as NULL, return x as-is
  if(is.null(rndTo)) return(x)
  # If rndTo is passed as -1L, make an estimate for rndTo
  if(isTRUE(all.equal(-1L, rndTo))) {
    # Get the number of unique values in x
    nUq <- length(unique(x))
    # If the number of unique values is no more than 150% of rndBucketsAuto, return as-is
    if(nUq <= 1.5*rndBucketsAuto) return(x)
    # Otherwise, estimate rndTo so the data span about rndBucketsAuto buckets;
    # na.rm=TRUE keeps stray NAs from poisoning the range estimate
    rndTo <- diff(range(x, na.rm=TRUE)) / rndBucketsAuto
    # Truncate to requested number of significant digits
    if(!is.null(nSig)) rndTo <- signif(rndTo, digits=nSig)
  }
  # Guard against a degenerate step (0, NA, or infinite), which would otherwise
  # produce NaN via division by zero; return x unrounded instead
  if(!is.finite(rndTo) || rndTo == 0) return(x)
  # Return the rounded vector if it was not already returned
  return(round(x/rndTo)*rndTo)
}
Key mapping tables for available metrics are also copied:
# Comma-separated list of every hourly metric exposed by the Open-Meteo archive API
hourlyMetrics <- "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
# Comma-separated list of every daily metric exposed by the Open-Meteo archive API
dailyMetrics <- "weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration"
# Newline-separated descriptions, aligned one-to-one with hourlyMetrics (33 entries);
# text is copied verbatim from the Open-Meteo documentation, including its typos
hourlyDescription <- "Air temperature at 2 meters above ground\nRelative humidity at 2 meters above ground\nDew point temperature at 2 meters above ground\nApparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nTotal precipitation (rain, showers, snow) sum of the preceding hour. Data is stored with a 0.1 mm precision. If precipitation data is summed up to monthly sums, there might be small inconsistencies with the total precipitation amount.\nOnly liquid precipitation of the preceding hour including local showers and rain from large scale systems.\nSnowfall amount of the preceding hour in centimeters. For the water equivalent in millimeter, divide by 7. E.g. 7 cm snow = 10 mm precipitation water equivalent\nTotal cloud cover as an area fraction\nLow level clouds and fog up to 2 km altitude\nMid level clouds from 2 to 6 km altitude\nHigh level clouds from 6 km altitude\nShortwave solar radiation as average of the preceding hour. This is equal to the total global horizontal irradiation\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDiffuse solar radiation as average of the preceding hour\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind speed at 10 or 100 meters above ground. 
Wind speed on 10 meters is the standard level.\nWind direction at 10 or 100 meters above ground\nWind direction at 10 or 100 meters above ground\nGusts at 10 meters above ground of the indicated hour. Wind gusts in CERRA are defined as the maximum wind gusts of the preceding hour. Please consult the ECMWF IFS documentation for more information on how wind gusts are parameterized in weather models.\nET0 Reference Evapotranspiration of a well watered grass field. Based on FAO-56 Penman-Monteith equations ET0 is calculated from temperature, wind speed, humidity and solar radiation. Unlimited soil water is assumed. ET0 is commonly used to estimate the required irrigation for plants.\nWeather condition as a numeric code. Follow WMO weather interpretation codes. See table below for details. Weather code is calculated from cloud cover analysis, precipitation and snowfall. As barely no information about atmospheric stability is available, estimation about thunderstorms is not possible.\nVapor Pressure Deificit (VPD) in kilopascal (kPa). For high VPD (>1.6), water transpiration of plants increases. For low VPD (<0.4), transpiration decreases\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths."
# Newline-separated descriptions, aligned one-to-one with dailyMetrics (16 entries)
dailyDescription <- "The most severe weather condition on a given day\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily air temperature at 2 meters above ground\nMaximum and minimum daily apparent temperature\nMaximum and minimum daily apparent temperature\nSum of daily precipitation (including rain, showers and snowfall)\nSum of daily rain\nSum of daily snowfall\nThe number of hours with rain\nSun rise and set times\nSun rise and set times\nMaximum wind speed and gusts on a day\nMaximum wind speed and gusts on a day\nDominant wind direction\nThe sum of solar radiaion on a given day in Megajoules\nDaily sum of ET0 Reference Evapotranspiration of a well watered grass field"
# Build the hourly metric lookup table: one row per metric with its description
tblMetricsHourly <- tibble::tibble(
  metric=str_split_1(hourlyMetrics, ","),
  description=str_split_1(hourlyDescription, "\n")
)
# Show every row (more than the default 10) to confirm metric/description alignment
print(tblMetricsHourly, n=50)
## # A tibble: 33 × 2
## metric description
## <chr> <chr>
## 1 temperature_2m Air temperature at 2 meters above ground
## 2 relativehumidity_2m Relative humidity at 2 meters above ground
## 3 dewpoint_2m Dew point temperature at 2 meters above ground
## 4 apparent_temperature Apparent temperature is the perceived feels-li…
## 5 pressure_msl Atmospheric air pressure reduced to mean sea l…
## 6 surface_pressure Atmospheric air pressure reduced to mean sea l…
## 7 precipitation Total precipitation (rain, showers, snow) sum …
## 8 rain Only liquid precipitation of the preceding hou…
## 9 snowfall Snowfall amount of the preceding hour in centi…
## 10 cloudcover Total cloud cover as an area fraction
## 11 cloudcover_low Low level clouds and fog up to 2 km altitude
## 12 cloudcover_mid Mid level clouds from 2 to 6 km altitude
## 13 cloudcover_high High level clouds from 6 km altitude
## 14 shortwave_radiation Shortwave solar radiation as average of the pr…
## 15 direct_radiation Direct solar radiation as average of the prece…
## 16 direct_normal_irradiance Direct solar radiation as average of the prece…
## 17 diffuse_radiation Diffuse solar radiation as average of the prec…
## 18 windspeed_10m Wind speed at 10 or 100 meters above ground. W…
## 19 windspeed_100m Wind speed at 10 or 100 meters above ground. W…
## 20 winddirection_10m Wind direction at 10 or 100 meters above ground
## 21 winddirection_100m Wind direction at 10 or 100 meters above ground
## 22 windgusts_10m Gusts at 10 meters above ground of the indicat…
## 23 et0_fao_evapotranspiration ET0 Reference Evapotranspiration of a well wat…
## 24 weathercode Weather condition as a numeric code. Follow WM…
## 25 vapor_pressure_deficit Vapor Pressure Deificit (VPD) in kilopascal (k…
## 26 soil_temperature_0_to_7cm Average temperature of different soil levels b…
## 27 soil_temperature_7_to_28cm Average temperature of different soil levels b…
## 28 soil_temperature_28_to_100cm Average temperature of different soil levels b…
## 29 soil_temperature_100_to_255cm Average temperature of different soil levels b…
## 30 soil_moisture_0_to_7cm Average soil water content as volumetric mixin…
## 31 soil_moisture_7_to_28cm Average soil water content as volumetric mixin…
## 32 soil_moisture_28_to_100cm Average soil water content as volumetric mixin…
## 33 soil_moisture_100_to_255cm Average soil water content as volumetric mixin…
# Build the daily metric lookup table: one row per metric with its description
tblMetricsDaily <- tibble::tibble(
  metric=str_split_1(dailyMetrics, ","),
  description=str_split_1(dailyDescription, "\n")
)
# Print the table (16 rows fit within the default print limit)
print(tblMetricsDaily)
## # A tibble: 16 × 2
## metric description
## <chr> <chr>
## 1 weathercode The most severe weather condition on a given day
## 2 temperature_2m_max Maximum and minimum daily air temperature at 2 me…
## 3 temperature_2m_min Maximum and minimum daily air temperature at 2 me…
## 4 apparent_temperature_max Maximum and minimum daily apparent temperature
## 5 apparent_temperature_min Maximum and minimum daily apparent temperature
## 6 precipitation_sum Sum of daily precipitation (including rain, showe…
## 7 rain_sum Sum of daily rain
## 8 snowfall_sum Sum of daily snowfall
## 9 precipitation_hours The number of hours with rain
## 10 sunrise Sun rise and set times
## 11 sunset Sun rise and set times
## 12 windspeed_10m_max Maximum wind speed and gusts on a day
## 13 windgusts_10m_max Maximum wind speed and gusts on a day
## 14 winddirection_10m_dominant Dominant wind direction
## 15 shortwave_radiation_sum The sum of solar radiaion on a given day in Megaj…
## 16 et0_fao_evapotranspiration Daily sum of ET0 Reference Evapotranspiration of …
Core datasets (previously downloaded) are loaded, with explanatory variables added for future processing:
# Read daily JSON file
nycOMDaily <- readOpenMeteoJSON("testOM_daily_nyc.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
nycOMDaily
## $tblDaily
## # A tibble: 4,914 × 18
## date time weathercode temperature_2m_max temperature_2m_min
## <date> <chr> <int> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 73 5 -1.4
## 2 2010-01-02 2010-01-02 71 -0.6 -9.2
## 3 2010-01-03 2010-01-03 71 -4.8 -10
## 4 2010-01-04 2010-01-04 1 -0.8 -7.3
## 5 2010-01-05 2010-01-05 1 -0.2 -7.3
## 6 2010-01-06 2010-01-06 2 1.1 -5.3
## 7 2010-01-07 2010-01-07 2 3.6 -3.7
## 8 2010-01-08 2010-01-08 71 1.9 -5.7
## 9 2010-01-09 2010-01-09 0 -1.4 -7.7
## 10 2010-01-10 2010-01-10 0 -1.7 -10.3
## # ℹ 4,904 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## # apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## # snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 40.7 -73.9 101. -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(nycOMDaily)
##
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 100.914
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36
# Read hourly JSON file
nycOMHourly <- readOpenMeteoJSON("testOM_hourly_nyc.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
nycOMHourly
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 117,936 × 37
## time date hour temperature_2m relativehumidity_2m
## <dttm> <date> <int> <dbl> <int>
## 1 2010-01-01 00:00:00 2010-01-01 0 -1.1 95
## 2 2010-01-01 01:00:00 2010-01-01 1 -1 96
## 3 2010-01-01 02:00:00 2010-01-01 2 -1 96
## 4 2010-01-01 03:00:00 2010-01-01 3 -0.8 97
## 5 2010-01-01 04:00:00 2010-01-01 4 -0.9 97
## 6 2010-01-01 05:00:00 2010-01-01 5 -0.8 97
## 7 2010-01-01 06:00:00 2010-01-01 6 -0.7 97
## 8 2010-01-01 07:00:00 2010-01-01 7 -0.5 97
## 9 2010-01-01 08:00:00 2010-01-01 8 -0.6 97
## 10 2010-01-01 09:00:00 2010-01-01 9 -0.6 97
## # ℹ 117,926 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # ℹ 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 40.7 -73.9 118. -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(nycOMHourly)
##
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 118.0021
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36
# Create percentiles for numeric variables
# Enrich the hourly table with calendar features (year, month, hour, time-of-day,
# season) and convert every numeric column to a 0-100 percentile-rank companion.
# NOTE: mutate() arguments are order-dependent here — tod/season/todSeason are
# first built as character, then re-typed as factors further down.
nycTemp <- nycOMHourly$tblHourly %>%
mutate(year=year(date),
# Month as a factor ordered by calendar position (Jan..Dec) via month.abb labels
month=factor(month.abb[lubridate::month(date)], levels=month.abb),
hour=lubridate::hour(time),
fct_hour=factor(hour),
# "Day" is defined as hours 07:00-18:00 inclusive; everything else is "Night"
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"),
season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Fall",
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
# "typo" is a sentinel that should never appear; it flags a bad month mapping
TRUE~"typo"
),
# Combine season and time-of-day as a plain string first...
todSeason=paste0(season, "-", tod),
# ...then re-type tod/season/todSeason as factors with an explicit level order
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
),
# Percentile-rank every numeric column into a rounded 0-100 pct_* companion;
# this sweeps in hour and year too, since both are numeric at this point
across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
)
glimpse(nycTemp)
## Rows: 117,936
## Columns: 78
## $ time <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m <dbl> -1.1, -1.0, -1.0, -0.8, -0.9, -0.8, …
## $ relativehumidity_2m <int> 95, 96, 96, 97, 97, 97, 97, 97, 97, …
## $ dewpoint_2m <dbl> -1.7, -1.6, -1.6, -1.2, -1.3, -1.2, …
## $ apparent_temperature <dbl> -3.9, -3.9, -3.9, -3.7, -3.7, -3.6, …
## $ pressure_msl <dbl> 1017.2, 1016.5, 1015.9, 1015.6, 1015…
## $ surface_pressure <dbl> 1012.6, 1011.9, 1011.3, 1011.0, 1011…
## $ precipitation <dbl> 0.5, 0.5, 0.4, 0.3, 0.1, 0.0, 0.0, 0…
## $ rain <dbl> 0.0, 0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0…
## $ snowfall <dbl> 0.35, 0.28, 0.21, 0.14, 0.07, 0.00, …
## $ cloudcover <int> 90, 93, 80, 68, 71, 100, 100, 100, 1…
## $ cloudcover_low <int> 2, 8, 3, 6, 15, 51, 99, 99, 96, 77, …
## $ cloudcover_mid <int> 98, 96, 99, 98, 95, 97, 98, 99, 94, …
## $ cloudcover_high <int> 97, 93, 59, 13, 0, 0, 0, 0, 0, 0, 0,…
## $ shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 53, 11…
## $ direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 20…
## $ direct_normal_irradiance <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 41, 93…
## $ windspeed_10m <dbl> 3.1, 3.5, 3.3, 3.9, 3.5, 3.4, 0.0, 1…
## $ windspeed_100m <dbl> 3.8, 3.1, 3.8, 4.7, 6.4, 5.7, 1.4, 1…
## $ winddirection_10m <int> 339, 336, 347, 338, 336, 342, 180, 2…
## $ winddirection_100m <int> 41, 21, 17, 356, 344, 342, 360, 217,…
## $ windgusts_10m <dbl> 9.0, 9.7, 10.1, 7.6, 7.6, 6.8, 5.4, …
## $ et0_fao_evapotranspiration <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …
## $ weathercode <int> 73, 73, 73, 71, 71, 3, 3, 3, 3, 3, 3…
## $ vapor_pressure_deficit <dbl> 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, …
## $ soil_temperature_0_to_7cm <dbl> -0.7, -0.7, -0.7, -0.6, -0.6, -0.6, …
## $ soil_temperature_7_to_28cm <dbl> 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0…
## $ soil_temperature_28_to_100cm <dbl> 4.2, 4.2, 4.1, 4.1, 4.1, 4.1, 4.1, 4…
## $ soil_temperature_100_to_255cm <dbl> 10.6, 10.6, 10.6, 10.6, 10.6, 10.6, …
## $ soil_moisture_0_to_7cm <dbl> 0.373, 0.374, 0.376, 0.377, 0.377, 0…
## $ soil_moisture_7_to_28cm <dbl> 0.377, 0.377, 0.377, 0.377, 0.377, 0…
## $ soil_moisture_28_to_100cm <dbl> 0.413, 0.413, 0.413, 0.413, 0.413, 0…
## $ soil_moisture_100_to_255cm <dbl> 0.412, 0.412, 0.412, 0.412, 0.412, 0…
## $ origTime <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod <fct> Night, Night, Night, Night, Night, N…
## $ season <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m <dbl> 10, 10, 10, 11, 11, 11, 11, 12, 11, …
## $ pct_relativehumidity_2m <dbl> 92, 94, 94, 96, 96, 96, 96, 96, 96, …
## $ pct_dewpoint_2m <dbl> 23, 24, 24, 25, 25, 25, 25, 25, 25, …
## $ pct_apparent_temperature <dbl> 15, 15, 15, 15, 15, 15, 17, 17, 16, …
## $ pct_pressure_msl <dbl> 53, 49, 46, 44, 44, 41, 38, 36, 37, …
## $ pct_surface_pressure <dbl> 51, 47, 44, 42, 42, 39, 36, 35, 36, …
## $ pct_precipitation <dbl> 93, 93, 92, 90, 86, 0, 0, 0, 0, 0, 0…
## $ pct_rain <dbl> 0, 87, 87, 87, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall <dbl> 99, 99, 99, 99, 98, 0, 0, 0, 0, 0, 0…
## $ pct_cloudcover <dbl> 77, 79, 72, 67, 68, 81, 81, 81, 81, …
## $ pct_cloudcover_low <dbl> 51, 60, 53, 58, 65, 77, 90, 90, 88, …
## $ pct_cloudcover_mid <dbl> 90, 89, 92, 90, 88, 89, 90, 92, 87, …
## $ pct_cloudcover_high <dbl> 81, 76, 63, 49, 0, 0, 0, 0, 0, 0, 0,…
## $ pct_shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 57, 6…
## $ pct_direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 62…
## $ pct_direct_normal_irradiance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 61…
## $ pct_diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 58, 7…
## $ pct_windspeed_10m <dbl> 3, 4, 3, 5, 4, 4, 0, 1, 2, 5, 8, 8, …
## $ pct_windspeed_100m <dbl> 2, 1, 2, 3, 6, 5, 0, 0, 4, 9, 9, 8, …
## $ pct_winddirection_10m <dbl> 94, 93, 96, 94, 93, 95, 35, 43, 53, …
## $ pct_winddirection_100m <dbl> 8, 4, 3, 99, 96, 95, 100, 46, 51, 61…
## $ pct_windgusts_10m <dbl> 3, 4, 5, 1, 1, 1, 0, 0, 0, 1, 2, 4, …
## $ pct_et0_fao_evapotranspiration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 32, 4…
## $ pct_weathercode <dbl> 99, 99, 99, 98, 98, 69, 69, 69, 69, …
## $ pct_vapor_pressure_deficit <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 8, …
## $ pct_soil_temperature_0_to_7cm <dbl> 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10,…
## $ pct_soil_temperature_7_to_28cm <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, …
## $ pct_soil_temperature_28_to_100cm <dbl> 16, 16, 15, 15, 15, 15, 15, 15, 15, …
## $ pct_soil_temperature_100_to_255cm <dbl> 42, 42, 42, 42, 42, 42, 42, 42, 42, …
## $ pct_soil_moisture_0_to_7cm <dbl> 70, 71, 73, 74, 74, 74, 74, 74, 73, …
## $ pct_soil_moisture_7_to_28cm <dbl> 69, 69, 69, 69, 69, 68, 68, 68, 68, …
## $ pct_soil_moisture_28_to_100cm <dbl> 96, 96, 96, 96, 96, 96, 96, 96, 96, …
## $ pct_soil_moisture_100_to_255cm <dbl> 96, 96, 96, 96, 96, 96, 96, 96, 96, …
## $ pct_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Heatmap of record counts by year (rows) and month (columns); labels show n,
# confirming complete hourly coverage across the sample period
nycTemp %>%
count(year, month) %>%
ggplot(aes(y=factor(year), x=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=3) +
scale_fill_continuous("# Records", low="white", high="green") +
labs(title="Records by year and month", x=NULL, y=NULL)
# Sanity check: todSeason should line up 1:1 with its season/tod components
nycTemp %>% count(todSeason, season, tod)
## # A tibble: 8 × 4
## todSeason season tod n
## <fct> <fct> <fct> <int>
## 1 Spring-Day Spring Day 15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day Summer Day 14532
## 4 Summer-Night Summer Night 14532
## 5 Fall-Day Fall Day 14196
## 6 Fall-Night Fall Night 14196
## 7 Winter-Day Winter Day 14784
## 8 Winter-Night Winter Night 14784
nycTemp %>% count(hour, fct_hour, tod) %>% print(n=30)
## # A tibble: 24 × 4
## hour fct_hour tod n
## <int> <fct> <fct> <int>
## 1 0 0 Night 4914
## 2 1 1 Night 4914
## 3 2 2 Night 4914
## 4 3 3 Night 4914
## 5 4 4 Night 4914
## 6 5 5 Night 4914
## 7 6 6 Night 4914
## 8 7 7 Day 4914
## 9 8 8 Day 4914
## 10 9 9 Day 4914
## 11 10 10 Day 4914
## 12 11 11 Day 4914
## 13 12 12 Day 4914
## 14 13 13 Day 4914
## 15 14 14 Day 4914
## 16 15 15 Day 4914
## 17 16 16 Day 4914
## 18 17 17 Day 4914
## 19 18 18 Day 4914
## 20 19 19 Night 4914
## 21 20 20 Night 4914
## 22 21 21 Night 4914
## 23 22 22 Night 4914
## 24 23 23 Night 4914
nycTemp %>% count(month, season)
## # A tibble: 12 × 3
## month season n
## <fct> <fct> <int>
## 1 Jan Winter 10416
## 2 Feb Winter 9480
## 3 Mar Spring 10416
## 4 Apr Spring 10080
## 5 May Spring 10416
## 6 Jun Summer 9720
## 7 Jul Summer 9672
## 8 Aug Summer 9672
## 9 Sep Fall 9360
## 10 Oct Fall 9672
## 11 Nov Fall 9360
## 12 Dec Winter 9672
# Add uniform random "noise" predictors of varying cardinality to the dataset,
# then split into train and test; set.seed makes the draws reproducible
set.seed(24020416)
nycTempRand <- nycTemp %>%
  mutate(pct_0005=sample(0:5, size=nrow(.), replace=TRUE),
         pct_0025=sample(0:25, size=nrow(.), replace=TRUE),
         pct_0100=sample(0:100, size=nrow(.), replace=TRUE),
         pct_0250=sample(0:250, size=nrow(.), replace=TRUE),
         pct_0500=sample(0:500, size=nrow(.), replace=TRUE),
         pct_1000=sample(0:1000, size=nrow(.), replace=TRUE),
         pct_2500=sample(0:2500, size=nrow(.), replace=TRUE),
         pct_5000=sample(0:5000, size=nrow(.), replace=TRUE)
  )
# Split into train and test data (3:1 split in favor of TRAIN: 75% of rows are
# sampled without replacement for training; the remainder form the test set)
idxTrain <- sort(sample(seq_len(nrow(nycTempRand)), size=round(0.75*nrow(nycTempRand)), replace=FALSE))
nycTempTrain <- nycTempRand[idxTrain, ]
nycTempTest <- nycTempRand[-idxTrain, ]
Holdout data are created from a succeeding year, and the function is tested on categorical variable month:
# Create holdout data and labels
# Training data: pre-2022 rows of the train split; test data: ALL 2022 rows
# (both splits recombined), so the holdout is an entirely later, unseen year.
# doy (day of year, 1-366) is added as an additional calendar predictor.
dfTrain <- filter(nycTempTrain, lubridate::year(date)<2022) %>% mutate(doy=yday(date))
dfTest <- filter(bind_rows(nycTempTest, nycTempTrain), lubridate::year(date)==2022) %>% mutate(doy=yday(date))
keyLabel <- "predictions based on pre-2022 training data applied to 2022 holdout dataset"
# Create set of relevant training variables
# Keep the pct_* percentile columns, dropping leakage-prone columns (pct_hour,
# pct_weathercode, pct_year) and the random noise columns, whose names all end
# in 0 or 5 (pct_0005 ... pct_5000)
varsTrain <- nycTempTrain %>%
select(starts_with("pct")) %>%
select(-pct_hour, -pct_weathercode, -pct_year, -ends_with("0"), -ends_with("5")) %>%
names()
varsTrain
## [1] "pct_temperature_2m" "pct_relativehumidity_2m"
## [3] "pct_dewpoint_2m" "pct_apparent_temperature"
## [5] "pct_pressure_msl" "pct_surface_pressure"
## [7] "pct_precipitation" "pct_rain"
## [9] "pct_snowfall" "pct_cloudcover"
## [11] "pct_cloudcover_low" "pct_cloudcover_mid"
## [13] "pct_cloudcover_high" "pct_shortwave_radiation"
## [15] "pct_direct_radiation" "pct_direct_normal_irradiance"
## [17] "pct_diffuse_radiation" "pct_windspeed_10m"
## [19] "pct_windspeed_100m" "pct_winddirection_10m"
## [21] "pct_winddirection_100m" "pct_windgusts_10m"
## [23] "pct_et0_fao_evapotranspiration" "pct_vapor_pressure_deficit"
## [25] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [27] "pct_soil_temperature_28_to_100cm" "pct_soil_temperature_100_to_255cm"
## [29] "pct_soil_moisture_0_to_7cm" "pct_soil_moisture_7_to_28cm"
## [31] "pct_soil_moisture_28_to_100cm" "pct_soil_moisture_100_to_255cm"
# Classification RF predicting month from all percentile predictors, evaluated
# on the 2022 holdout; full results (model, importance, predictions, accuracy)
# are captured via returnData=TRUE
rfMonth <- runFullRF(dfTrain=dfTrain,
yVar="month",
xVars=varsTrain,
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.774%
The function is tested on continuous variable temperature:
# Regression RF for 2m air temperature; temperature and apparent-temperature
# percentiles are excluded from the predictors so the model cannot trivially
# recover the target from its own percentile (leakage). rndTo=-1L auto-buckets
# predictions for the confusion-style plot; refXY adds a y=x reference line.
rfTemp2m <- runFullRF(dfTrain=dfTrain,
yVar="temperature_2m",
xVars=c(varsTrain[!str_detect(varsTrain, "pct_temp|apparent")], "month", "tod", "doy"),
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.239% (RMSE 0.9 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
The function is tested on continuous variable soil temperature:
# Regression RF for deep (100-255cm) soil temperature; all soil-temperature
# percentiles are excluded from the predictors to avoid target leakage, while
# soil-moisture and atmospheric percentiles plus calendar features remain
rfSoil255 <- runFullRF(dfTrain=dfTrain,
yVar="soil_temperature_100_to_255cm",
xVars=c(varsTrain[!str_detect(varsTrain, "pct_soil_temp")], "month", "tod", "doy"),
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.55% (RMSE 0.91 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'
Variable importance is extracted, and cumulative variance explained is explored:
# Stack the per-model importance tables (the list names become the src column
# via .id), then within each model compute each variable's share of total
# importance (pct), the cumulative share (cspct), and its importance rank (n)
dfImp <- map_dfr(list("month"=rfMonth,
"temperature_2m"=rfTemp2m,
"soil_temperature_100_to_255cm"=rfSoil255
),
.f=function(x) x$rfImp,
.id="src"
) %>%
arrange(src, -imp) %>%
group_by(src) %>%
mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) %>%
ungroup()
dfImp
## # A tibble: 96 × 6
## src metric imp pct cspct n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 month pct_soil_temperature_100_to_255cm 19868. 0.275 0.275 1
## 2 month pct_soil_temperature_28_to_100cm 12497. 0.173 0.448 2
## 3 month pct_soil_moisture_100_to_255cm 7197. 0.0996 0.547 3
## 4 month pct_soil_temperature_7_to_28cm 6355. 0.0879 0.635 4
## 5 month pct_soil_moisture_28_to_100cm 4874. 0.0674 0.703 5
## 6 month pct_soil_temperature_0_to_7cm 2780. 0.0385 0.741 6
## 7 month pct_soil_moisture_7_to_28cm 2655. 0.0367 0.778 7
## 8 month pct_soil_moisture_0_to_7cm 2013. 0.0279 0.806 8
## 9 month pct_apparent_temperature 1967. 0.0272 0.833 9
## 10 month pct_temperature_2m 1652. 0.0229 0.856 10
## # ℹ 86 more rows
# Plot cumulative importance share vs. number of variables, one curve per
# dependent variable; a synthetic (n=0, cspct=0) row per model anchors each
# curve at the origin
dfImp %>%
select(src, n, cspct) %>%
bind_rows(group_by(., src) %>% filter(n==1) %>% mutate(n=0, cspct=0) %>% ungroup) %>%
ggplot(aes(x=n, y=cspct)) +
geom_line(aes(group=src, color=src)) +
labs(x="# Variables", y="Cumulative %", title="Cumulative Variance Explained vs. # Variables") +
scale_color_discrete("Dependent\nVariable")
Month is predicted using only the four top importance variables:
# Refit the month classifier using only the 4 highest-importance predictors
# from dfImp; result (importance + accuracy) auto-prints at the top level
runPartialImportanceRF(dfTrain=dfTrain,
yVar="month",
dfTest=dfTest,
isContVar=FALSE,
impDB=dfImp,
nImp=4,
makePlots=TRUE
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.014%
## $rfImp
## # A tibble: 4 × 2
## metric imp
## <chr> <dbl>
## 1 pct_soil_temperature_100_to_255cm 31139.
## 2 pct_soil_temperature_28_to_100cm 20356.
## 3 pct_soil_moisture_100_to_255cm 10427.
## 4 pct_soil_temperature_7_to_28cm 9470.
##
## $rfAcc
## [1] 0.730137
Prediction accuracy decreases by roughly 10 percentage points (from 82.8% to 73.0%) when the predictors are limited to the top 4. A series of models is then run, varying the number of predictors:
# Variable importance number of variables to explore: 1-10, two intermediate
# points, and the full predictor count for the month model
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="month")))
# Accuracy on holdout data for each predictor count; vapply (rather than
# sapply) guarantees one numeric(1) per model, so the column type is stable
# even if a future refactor changes the shape of the returned element
rpiMonth <- tibble::tibble(nImp=impNums,
                           rfAcc=vapply(impNums,
                                        FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain,
                                                                               yVar="month",
                                                                               dfTest=dfTest,
                                                                               isContVar=FALSE,
                                                                               impDB=dfImp,
                                                                               nImp=x,
                                                                               makePlots=FALSE)[["rfAcc"]],
                                        FUN.VALUE=numeric(1)
                           )
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.443%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.765%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.9%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.068%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.724%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.934%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.256%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.785%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.249%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.922%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.037%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.39%
rpiMonth
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.424
## 2 2 0.758
## 3 3 0.735
## 4 4 0.729
## 5 5 0.801
## 6 6 0.817
## 7 7 0.829
## 8 8 0.813
## 9 9 0.828
## 10 10 0.822
## 11 16 0.829
## 12 25 0.830
## 13 32 0.834
# Plot of holdout accuracy vs. number of variables
# A synthetic (0, 0) row anchors the curve at the origin; the dashed
# horizontal line marks the maximum accuracy achieved
rpiMonth %>%
bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
ggplot(aes(x=nImp, y=rfAcc)) +
geom_line() +
geom_point() +
labs(title="Accuracy on holdout data vs. number of predictors",
subtitle="Predicting month",
y="Accuracy on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(0, 1)) +
geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
# Correlations
# Pairwise-correlation heatmap for the 8 highest-importance "month"
# predictors, to check whether importance is being diluted by collinearity
dfTrain %>%
select(all_of(varsTrain)) %>%
cor() %>%
as.data.frame() %>%
rownames_to_column("V1") %>%
tibble::tibble() %>%
pivot_longer(cols=-c(V1), names_to="V2") %>%
filter(V1 %in% pull(filter(dfImp, src=="month", n<=8), "metric"),
V2 %in% pull(filter(dfImp, src=="month", n<=8), "metric")
) %>%
ggplot(aes(x=fct_rev(V1), y=V2)) +
geom_tile(aes(fill=value)) +
geom_text(aes(label=round(value, 2))) +
scale_fill_gradient2(high="green") +
labs(title="Correlations of select predictors in training data", x=NULL, y=NULL)
Accuracy on holdout data is not monotonically increasing with the number of predictors (sorted by original variable importance). Maximum accuracy is reached with ~7 predictors. Filtering or transforming to account for correlated predictors may be warranted.
Temperature is predicted using only the four top importance variables:
# Temperature regression restricted to the top-4 predictors; rndTo and refXY
# are passed through to the plotting step (presumably rounding for display and
# adding a y=x reference line -- confirm in runPartialImportanceRF from _v001)
runPartialImportanceRF(dfTrain=dfTrain,
yVar="temperature_2m",
dfTest=dfTest,
isContVar=TRUE,
impDB=dfImp,
nImp=4,
makePlots=TRUE,
rndTo=-1L,
refXY=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.395% (RMSE 1.31 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
## $rfImp
## # A tibble: 4 × 2
## metric imp
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 4222109.
## 2 pct_dewpoint_2m 1433530.
## 3 pct_soil_temperature_7_to_28cm 2347242.
## 4 pct_soil_temperature_28_to_100cm 58720.
##
## $rfAcc
## mseNull msePred r2
## 106.1501502 1.7033478 0.9839534
A series of models are run, using a variable number of predictors:
# Variable importance number of variables to explore
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="temperature_2m")))
# Accuracy on holdout data
# Continuous target: rfAcc is c(mseNull, msePred, r2), so only "r2" is kept
rpiTemp <- tibble::tibble(nImp=impNums,
r2=sapply(impNums,
FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain,
yVar="temperature_2m",
dfTest=dfTest,
isContVar=TRUE,
impDB=dfImp,
nImp=x,
makePlots=FALSE)[["rfAcc"]]["r2"]
)
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.137% (RMSE 2.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.079% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.268% (RMSE 1.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.4% (RMSE 1.3 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.501% (RMSE 1.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.823% (RMSE 0.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.74% (RMSE 0.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.657% (RMSE 0.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.799% (RMSE 0.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.769% (RMSE 0.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.767% (RMSE 0.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.488% (RMSE 0.74 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.238% (RMSE 0.9 vs. 10.3 null)
rpiTemp
## # A tibble: 13 × 2
## nImp r2
## <dbl> <dbl>
## 1 1 0.961
## 2 2 0.981
## 3 3 0.983
## 4 4 0.984
## 5 5 0.985
## 6 6 0.998
## 7 7 0.997
## 8 8 0.997
## 9 9 0.998
## 10 10 0.998
## 11 16 0.998
## 12 25 0.995
## 13 33 0.992
# Plot of holdout accuracy vs. number of variables
# The (0, 0) row only extends the axes; the line/points themselves are
# filtered to nImp>0 so the curve is not dragged to the origin
rpiTemp %>%
bind_rows(tibble::tibble(nImp=0, r2=0)) %>%
ggplot(aes(x=nImp, y=r2)) +
geom_line(data=~filter(., nImp>0)) +
geom_point(data=~filter(., nImp>0)) +
labs(title="R-squared on holdout data vs. number of predictors",
subtitle="Predicting temperature",
y="R-squared on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(NA, 1)) +
geom_hline(data=~filter(., r2==max(r2)), aes(yintercept=r2), lty=2)
Soil temperature is predicted using only the four top importance variables:
# Deep soil temperature regression restricted to the top-4 predictors
runPartialImportanceRF(dfTrain=dfTrain,
yVar="soil_temperature_100_to_255cm",
dfTest=dfTest,
isContVar=TRUE,
impDB=dfImp,
nImp=4,
makePlots=TRUE,
rndTo=-1L,
refXY=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.013% (RMSE 1.01 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'
## $rfImp
## # A tibble: 4 × 2
## metric imp
## <chr> <dbl>
## 1 doy 1369981.
## 2 month 956018.
## 3 pct_soil_moisture_100_to_255cm 250564.
## 4 pct_soil_moisture_28_to_100cm 51323.
##
## $rfAcc
## mseNull msePred r2
## 34.1488705 1.0199375 0.9701326
A series of models are run, using a variable number of predictors:
# Variable importance number of variables to explore
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="soil_temperature_100_to_255cm")))
# Accuracy on holdout data
# Continuous target: keep only the "r2" element of the rfAcc vector
rpiSoil <- tibble::tibble(nImp=impNums,
r2=sapply(impNums,
FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain,
yVar="soil_temperature_100_to_255cm",
dfTest=dfTest,
isContVar=TRUE,
impDB=dfImp,
nImp=x,
makePlots=FALSE)[["rfAcc"]]["r2"]
)
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.804% (RMSE 0.64 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.372% (RMSE 0.75 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.553% (RMSE 0.91 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.021% (RMSE 1.01 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.137% (RMSE 0.99 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.205% (RMSE 0.98 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.233% (RMSE 0.97 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.29% (RMSE 0.96 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.312% (RMSE 0.96 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.369% (RMSE 0.95 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.419% (RMSE 0.94 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.524% (RMSE 0.92 vs. 5.84 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.588% (RMSE 0.91 vs. 5.84 null)
rpiSoil
## # A tibble: 13 × 2
## nImp r2
## <dbl> <dbl>
## 1 1 0.988
## 2 2 0.984
## 3 3 0.976
## 4 4 0.970
## 5 5 0.971
## 6 6 0.972
## 7 7 0.972
## 8 8 0.973
## 9 9 0.973
## 10 10 0.974
## 11 16 0.974
## 12 25 0.975
## 13 31 0.976
# Plot of holdout accuracy vs. number of variables
# (0, 0) row extends the axes only; curve drawn for nImp>0
rpiSoil %>%
bind_rows(tibble::tibble(nImp=0, r2=0)) %>%
ggplot(aes(x=nImp, y=r2)) +
geom_line(data=~filter(., nImp>0)) +
geom_point(data=~filter(., nImp>0)) +
labs(title="R-squared on holdout data vs. number of predictors",
subtitle="Predicting deep soil temperature",
y="R-squared on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(NA, 1)) +
geom_hline(data=~filter(., r2==max(r2)), aes(yintercept=r2), lty=2)
Deep soil temperature is so seasonal that using the best predictor (day of year) drives slightly more accurate predictions than models using day of year and multiple other features
The function is tested on variable day of year, as an integer, without access to month:
# Full RF predicting integer day-of-year; any doy-derived columns are dropped
# from the predictors and "tod" is added; returnData=TRUE keeps the holdout
# predictions for the per-month error breakdown below
rfDOYInt <- runFullRF(dfTrain=dfTrain,
yVar="doy",
xVars=c(varsTrain[!str_detect(varsTrain, "doy")], "tod"),
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=1,
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.752% (RMSE 46.23 vs. 105.37 null)
## `geom_smooth()` using formula = 'y ~ x'
# Per-month RMSE of the integer day-of-year predictions on the holdout data
rfDOYInt$tstPred %>%
  group_by(month) %>%
  summarize(doyrmse = sqrt(mean((pred - doy)^2)), n = n(), .groups = "drop")
## # A tibble: 12 × 3
## month doyrmse n
## <fct> <dbl> <int>
## 1 Jan 146. 744
## 2 Feb 8.54 672
## 3 Mar 10.5 744
## 4 Apr 4.69 720
## 5 May 5.34 744
## 6 Jun 4.20 720
## 7 Jul 3.72 744
## 8 Aug 4.38 744
## 9 Sep 6.67 720
## 10 Oct 6.23 744
## 11 Nov 6.68 720
## 12 Dec 59.3 744
Not surprisingly, predictions for winter have high numeric error: conditions on day 360 and day 5 are nearly indistinguishable, yet their day-of-year values are numerically far apart. Predictions outside of winter are generally within 5-10 days, based primarily on soil characteristics.
The function is tested on variable day of year, rounded to nearest 10 days and converted to factor, without access to month:
# Same exercise with doy coarsened: rounded to the nearest 10 days and treated
# as a factor (classification); day 366 is dropped so train/test levels match
rfDOYFct <- runFullRF(dfTrain=dfTrain %>% filter(doy<=365) %>% mutate(doy=factor(10*round(doy/10))),
yVar="doy",
xVars=c(varsTrain[!str_detect(varsTrain, "doy")], "tod"),
dfTest=dfTest %>% filter(doy<=365) %>% mutate(doy=factor(10*round(doy/10))),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=FALSE,
rndTo=NULL,
returnData=TRUE
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.648%
# Per-month accuracy: doyacc = exact bin match; doy10acc also counts adjacent
# bins (difference of 10) and the 0/360 pair, which handles the year-end wrap
rfDOYFct$tstPred %>%
group_by(month) %>%
summarize(doyacc=mean(doy==pred),
doy10acc=mean(abs(as.integer(as.character(doy))-as.integer(as.character(pred))) %in% c(0, 10, 360)),
n=n(),
.groups="drop"
)
## # A tibble: 12 × 4
## month doyacc doy10acc n
## <fct> <dbl> <dbl> <int>
## 1 Jan 0.472 0.849 744
## 2 Feb 0.278 0.815 672
## 3 Mar 0.140 0.551 744
## 4 Apr 0.492 0.964 720
## 5 May 0.535 1 744
## 6 Jun 0.622 1 720
## 7 Jul 0.526 0.996 744
## 8 Aug 0.367 0.958 744
## 9 Sep 0.328 0.817 720
## 10 Oct 0.585 0.964 744
## 11 Nov 0.746 1 720
## 12 Dec 0.618 0.993 744
Soil temperature and moisture patterns generally make day of year predictions accurate to within 10-20 days
Day of year is predicted using only the four top importance variables:
# Rebuild the combined importance table, now including the doy model:
# per-source importance share (pct), cumulative share (cspct), and rank (n)
dfImp <- map_dfr(list("month"=rfMonth,
"temperature_2m"=rfTemp2m,
"soil_temperature_100_to_255cm"=rfSoil255,
"doy"=rfDOYFct
),
.f=function(x) x$rfImp,
.id="src"
) %>%
arrange(src, -imp) %>%
group_by(src) %>%
mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) %>%
ungroup()
dfImp
## # A tibble: 129 × 6
## src metric imp pct cspct n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 doy pct_soil_temperature_100_to_255cm 15156. 0.198 0.198 1
## 2 doy pct_soil_temperature_28_to_100cm 10393. 0.136 0.333 2
## 3 doy pct_soil_moisture_100_to_255cm 7681. 0.100 0.434 3
## 4 doy pct_soil_moisture_28_to_100cm 6656. 0.0869 0.521 4
## 5 doy pct_soil_temperature_7_to_28cm 4736. 0.0618 0.582 5
## 6 doy pct_soil_moisture_7_to_28cm 4630. 0.0604 0.643 6
## 7 doy pct_soil_moisture_0_to_7cm 3307. 0.0432 0.686 7
## 8 doy pct_pressure_msl 2171. 0.0283 0.714 8
## 9 doy pct_surface_pressure 2165. 0.0283 0.743 9
## 10 doy pct_dewpoint_2m 1987. 0.0259 0.768 10
## # ℹ 119 more rows
# Factor-doy classifier restricted to the top-4 predictors; returnElem keeps
# the holdout predictions (tstPred) for the per-month breakdown below
rfDOYpi <- runPartialImportanceRF(dfTrain=dfTrain %>% filter(doy<=365) %>% mutate(doy=factor(10*round(doy/10))),
yVar="doy",
dfTest=dfTest %>% filter(doy<=365) %>% mutate(doy=factor(10*round(doy/10))),
isContVar=FALSE,
impDB=dfImp,
nImp=4,
makePlots=TRUE,
returnElem=c("rfImp", "rfAcc", "tstPred")
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.534%
# Per-month accuracy for the reduced model: doyacc = exact bin match;
# doy10acc also counts adjacent bins (10) and the 0/360 year-wrap pair
rfDOYpi$tstPred %>%
group_by(month) %>%
summarize(doyacc=mean(doy==pred),
doy10acc=mean(abs(as.integer(as.character(doy))-as.integer(as.character(pred))) %in% c(0, 10, 360)),
n=n(),
.groups="drop"
)
## # A tibble: 12 × 4
## month doyacc doy10acc n
## <fct> <dbl> <dbl> <int>
## 1 Jan 0.284 0.800 744
## 2 Feb 0.223 1 672
## 3 Mar 0.0927 0.364 744
## 4 Apr 0.647 0.928 720
## 5 May 0.781 1 744
## 6 Jun 0.583 1 720
## 7 Jul 0.446 1 744
## 8 Aug 0.0941 0.546 744
## 9 Sep 0.275 0.847 720
## 10 Oct 0.667 1 744
## 11 Nov 0.585 1 720
## 12 Dec 0.419 1 744
Restricting to the top 4 predictors still drives most predictions to within 10-20 days
A series of models are run, using a variable number of predictors:
# Variable importance number of variables to explore
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="doy")))
# Accuracy on holdout data
# Train/test are transformed the same way as for rfDOYFct (drop day 366,
# round doy to nearest 10, convert to factor)
rpiDOY <- tibble::tibble(nImp=impNums,
rfAcc=sapply(impNums,
FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain %>%
filter(doy<=365) %>%
mutate(doy=factor(10*round(doy/10))),
yVar="doy",
dfTest=dfTest %>%
filter(doy<=365) %>%
mutate(doy=factor(10*round(doy/10))),
isContVar=FALSE,
impDB=dfImp,
nImp=x,
makePlots=FALSE
)[["rfAcc"]]
)
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.043%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.226%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.836%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.007%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.365%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.091%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.221%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.032%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.651%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.854%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.082%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.116%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.927%
rpiDOY
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.270
## 2 2 0.422
## 3 3 0.438
## 4 4 0.440
## 5 5 0.454
## 6 6 0.451
## 7 7 0.462
## 8 8 0.470
## 9 9 0.457
## 10 10 0.479
## 11 16 0.481
## 12 25 0.481
## 13 33 0.489
# Plot of holdout accuracy vs. number of variables
# Synthetic (0, 0) row anchors the curve; dashed line marks the maximum
rpiDOY %>%
bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
ggplot(aes(x=nImp, y=rfAcc)) +
geom_line() +
geom_point() +
labs(title="Accuracy on holdout data vs. number of predictors",
subtitle="Predicting day of year, rounded to nearest 10, as factor",
y="Accuracy on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(0, 1)) +
geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
Just two soil temperature metrics (1.00m-2.55m and 0.28m-1.00m) drive accuracy nearly as high as including all 33 predictors.
Soil metrics are especially predictive of time of year:
# Combine training and holdout data (all years) for exploratory plotting
dfPlot <- nycTempTrain %>%
bind_rows(nycTempTest) %>%
arrange(time) %>%
mutate(doy=lubridate::yday(time))
# Scatter of the two deep soil temperature readings, colored by month
dfPlot %>%
count(month, soil_temperature_28_to_100cm, soil_temperature_100_to_255cm) %>%
ggplot(aes(x=soil_temperature_28_to_100cm, y=soil_temperature_100_to_255cm)) +
geom_point(aes(color=month, size=n)) +
labs(title="Soil temperature readings by month and year")
# Same scatter, faceted by year
dfPlot %>%
count(year, month, soil_temperature_28_to_100cm, soil_temperature_100_to_255cm) %>%
ggplot(aes(x=soil_temperature_28_to_100cm, y=soil_temperature_100_to_255cm)) +
geom_point(aes(color=month, size=n)) +
facet_wrap(~year) +
labs(title="Soil temperature readings by month and year")
# Same scatter, faceted by month and colored by year
dfPlot %>%
count(year, month, soil_temperature_28_to_100cm, soil_temperature_100_to_255cm) %>%
ggplot(aes(x=soil_temperature_28_to_100cm, y=soil_temperature_100_to_255cm)) +
geom_point(aes(color=factor(year), size=n)) +
facet_wrap(~month) +
labs(title="Soil temperature readings by month and year") +
scale_color_discrete(NULL)
# Daily mean and sd of each soil temperature metric, one line per year
dfPlot %>%
group_by(doy, year) %>%
summarize(across(.cols=c(soil_temperature_28_to_100cm, soil_temperature_100_to_255cm),
.fns=list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
pivot_longer(cols=-c(year, doy)) %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=year, color=factor(year))) +
facet_wrap(~name) +
scale_color_discrete(NULL) +
labs(title="Mean and sd for hourly soil temperature readings by day of year", x="Day of Year", y=NULL)
Deep soil temperatures are stable over extended time periods and display a repeatable, highly seasonal pattern. This makes them ideal predictors for metrics such as month or day of year
In contrast, air temperature and dewpoint show more intraday variability and less association with day of year:
# Round air temperature and dewpoint to whole degrees so count() bins them
dfPlotTemp <- dfPlot %>%
mutate(across(.cols=c(temperature_2m, dewpoint_2m), .fns=round))
# Scatter of temperature vs. dewpoint, colored by month
dfPlotTemp %>%
count(month, temperature_2m, dewpoint_2m) %>%
ggplot(aes(x=temperature_2m, y=dewpoint_2m)) +
geom_point(aes(color=month, size=n)) +
labs(title="Temperature and dewpoint readings by month and year",
subtitle="Readings taken hourly, rounded to nearest 1 degree C"
)
# Same scatter, faceted by month and colored by year
dfPlotTemp %>%
count(year, month, temperature_2m, dewpoint_2m) %>%
ggplot(aes(x=temperature_2m, y=dewpoint_2m)) +
geom_point(aes(color=factor(year), size=n)) +
facet_wrap(~month) +
labs(title="Temperature and dewpoint readings by month and year",
subtitle="Readings taken hourly, rounded to nearest 1 degree C"
) +
scale_color_discrete(NULL)
# Daily mean and sd of temperature and dewpoint, one line per year
dfPlotTemp %>%
group_by(doy, year) %>%
summarize(across(.cols=c(temperature_2m, dewpoint_2m),
.fns=list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
pivot_longer(cols=-c(year, doy)) %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=year, color=factor(year))) +
facet_wrap(~name) +
scale_color_discrete(NULL) +
labs(title="Mean and sd for hourly air temperature and dewpoint readings by day of year",
x="Day of Year",
y=NULL,
subtitle="Rounded to nearest 1 degree C"
)
# Overall mean/sd of the daily summaries, as a compact numeric comparison
dfPlotTemp %>%
group_by(doy, year) %>%
summarize(across(.cols=c(temperature_2m, dewpoint_2m),
.fns=list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
pivot_longer(cols=-c(year, doy)) %>%
group_by(name) %>%
summarize(dailyMean=mean(value), dailySD=sd(value))
## # A tibble: 4 × 3
## name dailyMean dailySD
## <chr> <dbl> <dbl>
## 1 dewpoint_2m_mean 6.56 10.1
## 2 dewpoint_2m_sd 1.98 1.26
## 3 temperature_2m_mean 12.1 9.66
## 4 temperature_2m_sd 2.76 1.11
Soil moisture metrics are also explored:
# Scatter of the two deep soil moisture readings, colored by month
dfPlot %>%
count(month, soil_moisture_28_to_100cm, soil_moisture_100_to_255cm) %>%
ggplot(aes(x=soil_moisture_28_to_100cm, y=soil_moisture_100_to_255cm)) +
geom_point(aes(color=month, size=n)) +
labs(title="Soil moisture readings by month and year")
# Same scatter, faceted by year
dfPlot %>%
count(year, month, soil_moisture_28_to_100cm, soil_moisture_100_to_255cm) %>%
ggplot(aes(x=soil_moisture_28_to_100cm, y=soil_moisture_100_to_255cm)) +
geom_point(aes(color=month, size=n)) +
facet_wrap(~year) +
labs(title="Soil moisture readings by month and year")
# Same scatter, faceted by month and colored by year
dfPlot %>%
count(year, month, soil_moisture_28_to_100cm, soil_moisture_100_to_255cm) %>%
ggplot(aes(x=soil_moisture_28_to_100cm, y=soil_moisture_100_to_255cm)) +
geom_point(aes(color=factor(year), size=n)) +
facet_wrap(~month) +
labs(title="Soil moisture readings by month and year") +
scale_color_discrete(NULL)
# Daily mean and sd of each soil moisture metric, one line per year
dfPlot %>%
group_by(doy, year) %>%
summarize(across(.cols=c(soil_moisture_28_to_100cm, soil_moisture_100_to_255cm),
.fns=list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
pivot_longer(cols=-c(year, doy)) %>%
ggplot(aes(x=doy, y=value)) +
geom_line(aes(group=year, color=factor(year))) +
facet_wrap(~name) +
scale_color_discrete(NULL) +
labs(title="Mean and sd for hourly soil moisture readings by day of year", x="Day of Year", y=NULL)
Deep soil moisture is less reproducibly seasonal, making it a less effective predictor of time of year
The function is tested on categorical variable year:
# Create holdout data and labels
# NOTE(review): unlike the earlier pre-2022/2022 split, both frames here keep
# all years < 2023 -- the train/holdout separation comes from the
# nycTempTrain/nycTempTest objects themselves; confirm that is intended
dfTrain_v2 <- filter(nycTempTrain, lubridate::year(date)<2023) %>% mutate(doy=yday(date), fct_year=factor(year))
dfTest_v2 <- filter(nycTempTest, lubridate::year(date)<2023) %>% mutate(doy=yday(date), fct_year=factor(year))
keyLabel_v2 <- "predictions based on training data applied to holdout dataset"
# Full RF predicting year (as factor) from the standard predictor set
rfYear <- runFullRF(dfTrain=dfTrain_v2,
yVar="fct_year",
xVars=varsTrain,
dfTest=dfTest_v2,
useLabel=keyLabel_v2,
useSub=stringr::str_to_sentence(keyLabel_v2),
returnData=TRUE
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 99.986%
# Rebuild the combined importance table, now including the year model
dfImp <- map_dfr(list("month"=rfMonth,
"temperature_2m"=rfTemp2m,
"soil_temperature_100_to_255cm"=rfSoil255,
"doy"=rfDOYFct,
"fct_year"=rfYear
),
.f=function(x) x$rfImp,
.id="src"
) %>%
arrange(src, -imp) %>%
group_by(src) %>%
mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) %>%
ungroup()
dfImp
## # A tibble: 161 × 6
## src metric imp pct cspct n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 doy pct_soil_temperature_100_to_255cm 15156. 0.198 0.198 1
## 2 doy pct_soil_temperature_28_to_100cm 10393. 0.136 0.333 2
## 3 doy pct_soil_moisture_100_to_255cm 7681. 0.100 0.434 3
## 4 doy pct_soil_moisture_28_to_100cm 6656. 0.0869 0.521 4
## 5 doy pct_soil_temperature_7_to_28cm 4736. 0.0618 0.582 5
## 6 doy pct_soil_moisture_7_to_28cm 4630. 0.0604 0.643 6
## 7 doy pct_soil_moisture_0_to_7cm 3307. 0.0432 0.686 7
## 8 doy pct_pressure_msl 2171. 0.0283 0.714 8
## 9 doy pct_surface_pressure 2165. 0.0283 0.743 9
## 10 doy pct_dewpoint_2m 1987. 0.0259 0.768 10
## # ℹ 151 more rows
There is both daily stability and annual variation in the combined metrics such that seeing ~70% of the hourly observations for each day is sufficient to determine the year
A series of models are run, using a variable number of predictors:
# Variable importance number of variables to explore
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="fct_year")))
# Accuracy on holdout data
# One reduced year classifier per element of impNums
rpiYear <- tibble::tibble(nImp=impNums,
rfAcc=sapply(impNums,
FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain_v2,
yVar="fct_year",
dfTest=dfTest_v2,
isContVar=FALSE,
impDB=dfImp,
nImp=x,
makePlots=FALSE
)[["rfAcc"]]
)
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.196%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.684%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.515%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.937%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.972%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.972%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.982%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.982%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.979%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.986%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.989%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.982%
rpiYear
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.272
## 2 2 0.767
## 3 3 0.985
## 4 4 0.997
## 5 5 0.999
## 6 6 1.00
## 7 7 1.00
## 8 8 1.00
## 9 9 1.00
## 10 10 1.00
## 11 16 1.00
## 12 25 1.00
## 13 32 1.00
# Plot of holdout accuracy vs. number of variables
# Synthetic (0, 0) row anchors the curve; dashed line marks the maximum
rpiYear %>%
bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
ggplot(aes(x=nImp, y=rfAcc)) +
geom_line() +
geom_point() +
labs(title="Accuracy on holdout data vs. number of predictors",
subtitle="Predicting year as factor",
y="Accuracy on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(0, 1)) +
geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
The function is tested on categorical variable hour:
# Create holdout data and labels
# Restore the pre-2022 train / 2022 holdout split for the hour model
dfTrain_v3 <- filter(nycTempTrain, lubridate::year(date)<2022) %>% mutate(doy=yday(date))
dfTest_v3 <- filter(nycTempTest, lubridate::year(date)==2022) %>% mutate(doy=yday(date))
keyLabel_v3 <- "predictions based on training data applied to holdout dataset"
# Full RF predicting hour (as factor); month and doy are added as predictors
rfHour <- runFullRF(dfTrain=dfTrain_v3,
yVar="fct_hour",
xVars=c(varsTrain, "month", "doy"),
dfTest=dfTest_v3,
useLabel=keyLabel_v3,
useSub=stringr::str_to_sentence(keyLabel_v3),
returnData=TRUE
)
## Growing trees.. Progress: 92%. Estimated remaining time: 2 seconds.
##
## Accuracy of predictions based on training data applied to holdout dataset is: 41.281%
# Rebuild the combined importance table, now including the hour model
dfImp <- map_dfr(list("month"=rfMonth,
"temperature_2m"=rfTemp2m,
"soil_temperature_100_to_255cm"=rfSoil255,
"doy"=rfDOYFct,
"fct_year"=rfYear,
"fct_hour"=rfHour
),
.f=function(x) x$rfImp,
.id="src"
) %>%
arrange(src, -imp) %>%
group_by(src) %>%
mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) %>%
ungroup()
dfImp
## # A tibble: 195 × 6
## src metric imp pct cspct n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 doy pct_soil_temperature_100_to_255cm 15156. 0.198 0.198 1
## 2 doy pct_soil_temperature_28_to_100cm 10393. 0.136 0.333 2
## 3 doy pct_soil_moisture_100_to_255cm 7681. 0.100 0.434 3
## 4 doy pct_soil_moisture_28_to_100cm 6656. 0.0869 0.521 4
## 5 doy pct_soil_temperature_7_to_28cm 4736. 0.0618 0.582 5
## 6 doy pct_soil_moisture_7_to_28cm 4630. 0.0604 0.643 6
## 7 doy pct_soil_moisture_0_to_7cm 3307. 0.0432 0.686 7
## 8 doy pct_pressure_msl 2171. 0.0283 0.714 8
## 9 doy pct_surface_pressure 2165. 0.0283 0.743 9
## 10 doy pct_dewpoint_2m 1987. 0.0259 0.768 10
## # ℹ 185 more rows
Hour (as factor) is predicted using only the four top importance variables:
# Hour classifier restricted to the 4 highest-importance predictors
runPartialImportanceRF(dfTrain=dfTrain_v3,
yVar="fct_hour",
dfTest=dfTest_v3,
isContVar=FALSE,
impDB=dfImp,
nImp=4,
makePlots=TRUE
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.027%
## $rfImp
## # A tibble: 4 × 2
## metric imp
## <chr> <dbl>
## 1 pct_diffuse_radiation 8767.
## 2 pct_shortwave_radiation 8763.
## 3 pct_vapor_pressure_deficit 8667.
## 4 doy 15474.
##
## $rfAcc
## [1] 0.3702665
Prediction accuracy is decreased by ~5% as predictors are limited to 4, with some of the nighttime hours never predicted. A series of models are run, using a variable number of predictors:
# Variable importance number of variables to explore
impNums <- c(1:10, 16, 25, nrow(filter(dfImp, src=="fct_hour")))
# Accuracy on holdout data
# One reduced hour classifier per element of impNums
rpiHour <- tibble::tibble(nImp=impNums,
rfAcc=sapply(impNums,
FUN=function(x) runPartialImportanceRF(dfTrain=dfTrain_v3,
yVar="fct_hour",
dfTest=dfTest_v3,
isContVar=FALSE,
impDB=dfImp,
nImp=x,
makePlots=FALSE)[["rfAcc"]]
)
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.501%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.485%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.196%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.26%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.681%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.897%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.569%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.523%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.673%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.429%
## Growing trees.. Progress: 97%. Estimated remaining time: 1 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.224%
## Growing trees.. Progress: 87%. Estimated remaining time: 4 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.58%
## Growing trees.. Progress: 86%. Estimated remaining time: 4 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.169%
rpiHour
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.115
## 2 2 0.175
## 3 3 0.202
## 4 4 0.373
## 5 5 0.377
## 6 6 0.389
## 7 7 0.386
## 8 8 0.385
## 9 9 0.407
## 10 10 0.384
## 11 16 0.392
## 12 25 0.406
## 13 34 0.422
# Plot of holdout accuracy vs. number of variables
# Synthetic (0, 0) row anchors the curve; dashed line marks the maximum
rpiHour %>%
bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
ggplot(aes(x=nImp, y=rfAcc)) +
geom_line() +
geom_point() +
labs(title="Accuracy on holdout data vs. number of predictors",
subtitle="Predicting hour",
y="Accuracy on holdout data",
x="# Predictors (selected in order of variable importance in full model)"
) +
lims(y=c(0, 1)) +
geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
# Correlations
# Heatmap for the 8 highest-importance "fct_hour" predictors; note the
# filter silently drops any top-8 metric not in varsTrain (e.g. doy)
dfTrain %>%
select(all_of(varsTrain)) %>%
cor() %>%
as.data.frame() %>%
rownames_to_column("V1") %>%
tibble::tibble() %>%
pivot_longer(cols=-c(V1), names_to="V2") %>%
filter(V1 %in% pull(filter(dfImp, src=="fct_hour", n<=8), "metric"),
V2 %in% pull(filter(dfImp, src=="fct_hour", n<=8), "metric")
) %>%
ggplot(aes(x=fct_rev(V1), y=V2)) +
geom_tile(aes(fill=value)) +
geom_text(aes(label=round(value, 2))) +
scale_fill_gradient2(high="green") +
labs(title="Correlations of select predictors in training data", x=NULL, y=NULL)
Accuracy generally increases with number of predictors, plateauing around 40% with 4+ predictors
The process for running partial importance is converted to functional form:
# Run partial-importance random forests over a range of predictor counts,
# print the resulting holdout metric table, plot metric vs. predictor count,
# and return the table.
autoPartialImportance <- function(dfTrain,
                                  dfTest,
                                  yVar,
                                  isContVar,
                                  impDB=dfImp,
                                  impNums=c(1:10, 16, 25, nrow(filter(impDB, src==yVar)))
) {
  # FUNCTION ARGUMENTS:
  # dfTrain: training data
  # dfTest: test (holdout) data
  # yVar: dependent variable
  # isContVar: boolean, is this a continuous variable (R-2) or categorical variable (accuracy)?
  # impDB: tibble containing sorted variable importances by predictor
  # impNums: vector of number of variables to run (each element in vector run)
  #   FIX: the default now derives its maximum from impDB (the argument)
  #   rather than the global dfImp, so a caller-supplied importance table
  #   is honored consistently
  # Holdout accuracy (or R-squared) for each predictor count.
  # vapply() (rather than sapply()) guarantees one numeric(1) per element.
  tblRPI <- tibble::tibble(nImp=impNums,
                           rfAcc=vapply(impNums,
                                        FUN=function(x) {
                                          y <- runPartialImportanceRF(dfTrain=dfTrain,
                                                                      yVar=yVar,
                                                                      dfTest=dfTest,
                                                                      isContVar=isContVar,
                                                                      impDB=impDB,
                                                                      nImp=x,
                                                                      makePlots=FALSE
                                          )[["rfAcc"]]
                                          # Continuous targets return several metrics; keep R-squared
                                          if(isTRUE(isContVar)) y <- y["r2"]
                                          unname(y)
                                        },
                                        FUN.VALUE=numeric(1)
                           )
  )
  print(tblRPI)
  # Plot of holdout accuracy/r-squared vs. number of variables
  if(isTRUE(isContVar)) prtDesc <- "R-squared" else prtDesc <- "Accuracy"
  p1 <- tblRPI %>%
    select(nImp, rfAcc) %>%
    bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
    ggplot(aes(x=nImp, y=rfAcc)) +
    geom_line() +
    geom_point() +
    labs(title=paste0(prtDesc, " on holdout data vs. number of predictors"),
         subtitle=paste0("Predicting ", yVar),
         y=paste0(prtDesc, " on holdout data"),
         x="# Predictors (selected in order of variable importance in full model)"
    ) +
    lims(y=c(0, 1)) +
    geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
  print(p1)
  return(tblRPI)
}
The function is tested on hour, as factor:
# Sanity-check the new function against the earlier manual run (rpiHour):
# hour-of-day as a factor, classification accuracy on the holdout set
apiHour <- autoPartialImportance(dfTrain=dfTrain_v3, dfTest=dfTest_v3, yVar="fct_hour", isContVar=FALSE)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.267%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.251%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.103%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.886%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.616%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.177%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.242%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.775%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.065%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.476%
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.476%
## Growing trees.. Progress: 57%. Estimated remaining time: 23 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.72%
## Growing trees.. Progress: 64%. Estimated remaining time: 17 seconds.
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.842%
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.113
## 2 2 0.173
## 3 3 0.201
## 4 4 0.369
## 5 5 0.386
## 6 6 0.392
## 7 7 0.382
## 8 8 0.378
## 9 9 0.401
## 10 10 0.385
## 11 16 0.385
## 12 25 0.407
## 13 34 0.418
# Overlay function-based (api) and manual (rpi) accuracy curves for fct_hour
apiHour %>%
  colRenamer(c("rfAcc"="apiAcc")) %>%
  full_join(rpiHour, by=c("nImp")) %>%
  colRenamer(c("rfAcc"="rpiAcc")) %>%
  pivot_longer(cols=-c(nImp)) %>%
  ggplot(aes(x=nImp, y=value)) +
  geom_point(aes(color=name)) +
  geom_line(aes(group=name, color=name)) +
  lims(y=c(0, 1)) +
  scale_color_discrete(NULL) +
  labs(title="Accuracy on holdout data vs. number of predictors",
       subtitle="Predicting hour as factor using function api and previous results rpi",
       y="Accuracy on holdout data",
       x="# Predictors (selected in order of variable importance in full model)"
  )
While there are minor differences due to different random states, the function broadly gives the same results as the previous code
The function is tested on temperature:
# Exercise the continuous-target path (isContVar=TRUE reports R-squared)
# on temperature, for comparison with the earlier manual rpiTemp run
apiTemp <- autoPartialImportance(dfTrain=dfTrain, dfTest=dfTest, yVar="temperature_2m", isContVar=TRUE)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.135% (RMSE 2.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.081% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.266% (RMSE 1.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.397% (RMSE 1.3 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.498% (RMSE 1.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.822% (RMSE 0.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.743% (RMSE 0.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.663% (RMSE 0.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.79% (RMSE 0.47 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.77% (RMSE 0.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.766% (RMSE 0.5 vs. 10.3 null)
## Growing trees.. Progress: 100%. Estimated remaining time: 0 seconds.
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.465% (RMSE 0.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.259% (RMSE 0.89 vs. 10.3 null)
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.961
## 2 2 0.981
## 3 3 0.983
## 4 4 0.984
## 5 5 0.985
## 6 6 0.998
## 7 7 0.997
## 8 8 0.997
## 9 9 0.998
## 10 10 0.998
## 11 16 0.998
## 12 25 0.995
## 13 33 0.993
# Overlay function-based (api) and manual (rpi) R-squared curves for temperature
apiTemp %>%
  colRenamer(c("rfAcc"="apiR2")) %>%
  full_join(rpiTemp, by=c("nImp")) %>%
  colRenamer(c("r2"="rpiR2")) %>%
  pivot_longer(cols=-c(nImp)) %>%
  ggplot(aes(x=nImp, y=value)) +
  geom_point(aes(color=name)) +
  geom_line(aes(group=name, color=name)) +
  lims(y=c(0, 1)) +
  scale_color_discrete(NULL) +
  labs(title="R-squared on holdout data vs. number of predictors",
       subtitle="Predicting temperature using function api and previous results rpi",
       y="R-squared on holdout data",
       x="# Predictors (selected in order of variable importance in full model)"
  )
While there are minor differences due to different random states, the function broadly gives the same results as the previous code
The function is tested on categorical variable todSeason (a mix of day/night and the four seasons):
# Full random forest predicting todSeason from all training predictors;
# returnData=TRUE so the fitted object's importances feed into dfImp below
rfTODS <- runFullRF(dfTrain=dfTrain_v3,
yVar="todSeason",
xVars=c(varsTrain),
dfTest=dfTest_v3,
useLabel=keyLabel_v3,
useSub=stringr::str_to_sentence(keyLabel_v3),
returnData=TRUE
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 89.06%
# Stack the variable importances from every fitted model into one tibble,
# then rank within each source model, adding importance share (pct),
# cumulative share (cspct), and rank (n)
dfImp <- map_dfr(list("month"=rfMonth,
                      "temperature_2m"=rfTemp2m,
                      "soil_temperature_100_to_255cm"=rfSoil255,
                      "doy"=rfDOYFct,
                      "fct_year"=rfYear,
                      "fct_hour"=rfHour,
                      "todSeason"=rfTODS
                      ),
                 .f=~.x$rfImp,
                 .id="src"
) %>%
  arrange(src, desc(imp)) %>%
  group_by(src) %>%
  mutate(pct=imp/sum(imp), cspct=cumsum(pct), n=row_number()) %>%
  ungroup()
dfImp
## # A tibble: 227 × 6
## src metric imp pct cspct n
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 doy pct_soil_temperature_100_to_255cm 15156. 0.198 0.198 1
## 2 doy pct_soil_temperature_28_to_100cm 10393. 0.136 0.333 2
## 3 doy pct_soil_moisture_100_to_255cm 7681. 0.100 0.434 3
## 4 doy pct_soil_moisture_28_to_100cm 6656. 0.0869 0.521 4
## 5 doy pct_soil_temperature_7_to_28cm 4736. 0.0618 0.582 5
## 6 doy pct_soil_moisture_7_to_28cm 4630. 0.0604 0.643 6
## 7 doy pct_soil_moisture_0_to_7cm 3307. 0.0432 0.686 7
## 8 doy pct_pressure_msl 2171. 0.0283 0.714 8
## 9 doy pct_surface_pressure 2165. 0.0283 0.743 9
## 10 doy pct_dewpoint_2m 1987. 0.0259 0.768 10
## # ℹ 217 more rows
Variable todSeason is predicted using only the four top importance variables:
# Predict todSeason using only the 4 highest-importance predictors from
# dfImp; makePlots=TRUE so importance plots are rendered for inspection
runPartialImportanceRF(dfTrain=dfTrain_v3,
yVar="todSeason",
dfTest=dfTest_v3,
isContVar=FALSE,
impDB=dfImp,
nImp=4,
makePlots=TRUE
)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.086%
## $rfImp
## # A tibble: 4 × 2
## metric imp
## <chr> <dbl>
## 1 pct_soil_temperature_100_to_255cm 16650.
## 2 pct_soil_temperature_28_to_100cm 12700.
## 3 pct_soil_temperature_7_to_28cm 13211.
## 4 pct_soil_moisture_100_to_255cm 7694.
##
## $rfAcc
## [1] 0.4408602
Prediction accuracy is decreased as predictors are limited to 4, with day/night especially impacted due to lack of a radiation variable. A series of models are run, using a variable number of predictors:
Variable todSeason is predicted using a range of predictors:
# Sweep predictor counts for todSeason using the importance-ranked subsets
apiTODS <- autoPartialImportance(dfTrain=dfTrain_v3, dfTest=dfTest_v3, yVar="todSeason", isContVar=FALSE)
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.043%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.985%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.639%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.899%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.375%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.198%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.162%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.499%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.471%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.845%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.967%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.014%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.873%
## # A tibble: 13 × 2
## nImp rfAcc
## <dbl> <dbl>
## 1 1 0.310
## 2 2 0.470
## 3 3 0.476
## 4 4 0.439
## 5 5 0.824
## 6 6 0.842
## 7 7 0.862
## 8 8 0.885
## 9 9 0.875
## 10 10 0.878
## 11 16 0.890
## 12 25 0.890
## 13 32 0.889
Prediction accuracy soars when a soil temperature variable and a radiation variable are both included
Each possible predictor is run on a stand-alone basis:
# Holdout accuracy of todSeason predicted from each predictor alone.
# vapply() replaces sapply(): it guarantees a named numeric vector (one
# numeric(1) per predictor) instead of silently changing return type on
# unexpected input
tstOneVar <- vapply(varsTrain, FUN=function(x) {
  runFullRF(dfTrain=dfTrain_v3,
            yVar="todSeason",
            xVars=x,
            dfTest=dfTest_v3,
            useLabel=keyLabel_v3,
            useSub=stringr::str_to_sentence(keyLabel_v3),
            makePlots=FALSE,
            returnData=TRUE
  )[["rfAcc"]]
}, FUN.VALUE=numeric(1)
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 37.868%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 17.672%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 27.583%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 35.624%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 18.28%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 19.355%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 13.417%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 13.417%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 13.511%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 19.308%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 16.269%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 20.757%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 16.082%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 30.201%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 27.489%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 28.237%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 31.744%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 16.316%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 17.625%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 12.529%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 13.978%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 15.708%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 29.032%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 28.612%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.645%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 34.923%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 30.108%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 31.136%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 28.284%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 24.638%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 23.142%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 32.726%
# Rank single-predictor accuracy, best first
tstOneVar %>%
  tibble::enframe(name="pred", value="rfAcc") %>%
  arrange(desc(rfAcc)) %>%
  print(n=40)
## # A tibble: 32 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 0.396
## 2 pct_temperature_2m 0.379
## 3 pct_apparent_temperature 0.356
## 4 pct_soil_temperature_7_to_28cm 0.349
## 5 pct_soil_moisture_100_to_255cm 0.327
## 6 pct_diffuse_radiation 0.317
## 7 pct_soil_temperature_100_to_255cm 0.311
## 8 pct_shortwave_radiation 0.302
## 9 pct_soil_temperature_28_to_100cm 0.301
## 10 pct_et0_fao_evapotranspiration 0.290
## 11 pct_vapor_pressure_deficit 0.286
## 12 pct_soil_moisture_0_to_7cm 0.283
## 13 pct_direct_normal_irradiance 0.282
## 14 pct_dewpoint_2m 0.276
## 15 pct_direct_radiation 0.275
## 16 pct_soil_moisture_7_to_28cm 0.246
## 17 pct_soil_moisture_28_to_100cm 0.231
## 18 pct_cloudcover_mid 0.208
## 19 pct_surface_pressure 0.194
## 20 pct_cloudcover 0.193
## 21 pct_pressure_msl 0.183
## 22 pct_relativehumidity_2m 0.177
## 23 pct_windspeed_100m 0.176
## 24 pct_windspeed_10m 0.163
## 25 pct_cloudcover_low 0.163
## 26 pct_cloudcover_high 0.161
## 27 pct_windgusts_10m 0.157
## 28 pct_winddirection_100m 0.140
## 29 pct_snowfall 0.135
## 30 pct_precipitation 0.134
## 31 pct_rain 0.134
## 32 pct_winddirection_10m 0.125
Each possible predictor is run on a stand-alone basis, along with the best predictor:
# Best stand-alone predictor (name of the maximum accuracy)
bestOneVar <- names(which.max(tstOneVar))
bestOneVar
## [1] "pct_soil_temperature_0_to_7cm"
# Holdout accuracy of each remaining predictor paired with the best
# stand-alone predictor. vapply() replaces sapply() for a type-stable
# named numeric result, consistent with tstOneVar
tstTwoVar <- vapply(setdiff(varsTrain, bestOneVar), FUN=function(x) {
  runFullRF(dfTrain=dfTrain_v3,
            yVar="todSeason",
            xVars=c(x, bestOneVar),
            dfTest=dfTest_v3,
            useLabel=keyLabel_v3,
            useSub=stringr::str_to_sentence(keyLabel_v3),
            makePlots=FALSE,
            returnData=TRUE
  )[["rfAcc"]]
}, FUN.VALUE=numeric(1)
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 42.122%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 44.226%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 42.403%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 43.619%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.691%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.598%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 38.756%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 38.429%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 37.962%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 38.382%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 37.588%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.878%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 40.439%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.366%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 52.735%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 50.538%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.862%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.598%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.878%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 36.372%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 36.606%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.224%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.007%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 44.554%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 53.109%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 52.034%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 58.813%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 41%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 39.364%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 43.011%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 47.592%
# Rank two-predictor accuracy, best first
tstTwoVar %>%
  tibble::enframe(name="pred", value="rfAcc") %>%
  arrange(desc(rfAcc)) %>%
  print(n=40)
## # A tibble: 31 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_diffuse_radiation 0.639
## 2 pct_shortwave_radiation 0.624
## 3 pct_soil_temperature_100_to_255cm 0.588
## 4 pct_et0_fao_evapotranspiration 0.560
## 5 pct_soil_temperature_7_to_28cm 0.531
## 6 pct_direct_radiation 0.527
## 7 pct_soil_temperature_28_to_100cm 0.520
## 8 pct_direct_normal_irradiance 0.505
## 9 pct_soil_moisture_100_to_255cm 0.476
## 10 pct_vapor_pressure_deficit 0.446
## 11 pct_relativehumidity_2m 0.442
## 12 pct_apparent_temperature 0.436
## 13 pct_soil_moisture_28_to_100cm 0.430
## 14 pct_dewpoint_2m 0.424
## 15 pct_temperature_2m 0.421
## 16 pct_soil_moisture_0_to_7cm 0.410
## 17 pct_cloudcover_high 0.404
## 18 pct_cloudcover_mid 0.399
## 19 pct_windspeed_100m 0.399
## 20 pct_pressure_msl 0.397
## 21 pct_surface_pressure 0.396
## 22 pct_windspeed_10m 0.396
## 23 pct_soil_moisture_7_to_28cm 0.394
## 24 pct_windgusts_10m 0.392
## 25 pct_precipitation 0.388
## 26 pct_rain 0.384
## 27 pct_cloudcover 0.384
## 28 pct_snowfall 0.380
## 29 pct_cloudcover_low 0.376
## 30 pct_winddirection_100m 0.366
## 31 pct_winddirection_10m 0.364
Each possible predictor is run on a stand-alone basis, along with the best two predictors:
# Best second predictor (name of the maximum accuracy among pairs)
bestTwoVar <- names(which.max(tstTwoVar))
bestTwoVar
## [1] "pct_diffuse_radiation"
# Holdout accuracy of each remaining predictor combined with the best two.
# vapply() replaces sapply() for a type-stable named numeric result
tstThreeVar <- vapply(setdiff(varsTrain, c(bestOneVar, bestTwoVar)), FUN=function(x) {
  runFullRF(dfTrain=dfTrain_v3,
            yVar="todSeason",
            xVars=c(x, bestOneVar, bestTwoVar),
            dfTest=dfTest_v3,
            useLabel=keyLabel_v3,
            useSub=stringr::str_to_sentence(keyLabel_v3),
            makePlots=FALSE,
            returnData=TRUE
  )[["rfAcc"]]
}, FUN.VALUE=numeric(1)
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.768%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.319%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.049%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.16%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.217%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.61%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.412%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.709%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.085%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.207%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.431%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.329%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.002%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.148%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.101%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.054%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.992%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.786%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 60.402%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.664%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.347%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.571%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.973%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.965%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 75.129%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.32%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.815%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.002%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.965%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 77.653%
# Rank three-predictor accuracy, best first
tstThreeVar %>%
  tibble::enframe(name="pred", value="rfAcc") %>%
  arrange(desc(rfAcc)) %>%
  print(n=40)
## # A tibble: 30 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_100_to_255cm 0.853
## 2 pct_soil_moisture_100_to_255cm 0.777
## 3 pct_soil_temperature_28_to_100cm 0.751
## 4 pct_soil_temperature_7_to_28cm 0.660
## 5 pct_soil_moisture_28_to_100cm 0.660
## 6 pct_pressure_msl 0.652
## 7 pct_surface_pressure 0.646
## 8 pct_cloudcover_mid 0.643
## 9 pct_dewpoint_2m 0.640
## 10 pct_cloudcover_high 0.640
## 11 pct_soil_moisture_7_to_28cm 0.640
## 12 pct_soil_moisture_0_to_7cm 0.638
## 13 pct_temperature_2m 0.638
## 14 pct_windgusts_10m 0.633
## 15 pct_cloudcover 0.632
## 16 pct_apparent_temperature 0.632
## 17 pct_vapor_pressure_deficit 0.630
## 18 pct_windspeed_100m 0.628
## 19 pct_precipitation 0.624
## 20 pct_relativehumidity_2m 0.623
## 21 pct_snowfall 0.621
## 22 pct_windspeed_10m 0.620
## 23 pct_winddirection_100m 0.617
## 24 pct_et0_fao_evapotranspiration 0.616
## 25 pct_cloudcover_low 0.614
## 26 pct_winddirection_10m 0.604
## 27 pct_rain 0.567
## 28 pct_shortwave_radiation 0.561
## 29 pct_direct_radiation 0.561
## 30 pct_direct_normal_irradiance 0.561
Each possible predictor is run on a stand-alone basis, along with the best three predictors:
# Best third predictor (name of the maximum accuracy among triples)
bestThreeVar <- names(which.max(tstThreeVar))
bestThreeVar
## [1] "pct_soil_temperature_100_to_255cm"
# Holdout accuracy of each remaining predictor combined with the best three.
# vapply() replaces sapply() for a type-stable named numeric result
tstFourVar <- vapply(setdiff(varsTrain, c(bestOneVar, bestTwoVar, bestThreeVar)), FUN=function(x) {
  runFullRF(dfTrain=dfTrain_v3,
            yVar="todSeason",
            xVars=c(x, bestOneVar, bestTwoVar, bestThreeVar),
            dfTest=dfTest_v3,
            useLabel=keyLabel_v3,
            useSub=stringr::str_to_sentence(keyLabel_v3),
            makePlots=FALSE,
            returnData=TRUE
  )[["rfAcc"]]
}, FUN.VALUE=numeric(1)
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.712%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.227%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.572%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.04%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.338%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.105%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.479%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.666%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.04%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.554%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.479%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.666%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.881%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.928%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.788%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.554%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 83.871%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.105%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 83.918%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 83.918%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 84.525%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.647%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.367%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 86.863%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 86.255%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.507%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 86.068%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.273%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 77.84%
# Rank four-predictor accuracy, best first
tstFourVar %>%
  tibble::enframe(name="pred", value="rfAcc") %>%
  arrange(desc(rfAcc)) %>%
  print(n=40)
## # A tibble: 29 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_7_to_28cm 0.869
## 2 pct_soil_temperature_28_to_100cm 0.863
## 3 pct_soil_moisture_7_to_28cm 0.861
## 4 pct_shortwave_radiation 0.859
## 5 pct_cloudcover_high 0.859
## 6 pct_direct_radiation 0.858
## 7 pct_et0_fao_evapotranspiration 0.856
## 8 pct_cloudcover 0.856
## 9 pct_direct_normal_irradiance 0.856
## 10 pct_soil_moisture_0_to_7cm 0.855
## 11 pct_vapor_pressure_deficit 0.854
## 12 pct_soil_moisture_28_to_100cm 0.853
## 13 pct_relativehumidity_2m 0.852
## 14 pct_apparent_temperature 0.850
## 15 pct_snowfall 0.850
## 16 pct_temperature_2m 0.847
## 17 pct_rain 0.847
## 18 pct_cloudcover_mid 0.847
## 19 pct_dewpoint_2m 0.846
## 20 pct_windgusts_10m 0.845
## 21 pct_precipitation 0.845
## 22 pct_cloudcover_low 0.845
## 23 pct_pressure_msl 0.843
## 24 pct_surface_pressure 0.841
## 25 pct_windspeed_100m 0.841
## 26 pct_winddirection_10m 0.839
## 27 pct_winddirection_100m 0.839
## 28 pct_windspeed_10m 0.839
## 29 pct_soil_moisture_100_to_255cm 0.778
The evolution in prediction accuracy is plotted:
# Plot the stepwise accuracy trajectory: best accuracy at 1..4 predictors
# (anchored at 0 for 0 predictors), plus the full-model accuracy pulled
# from apiTODS at its maximum nImp
tibble::tibble(x=0:4,
y=c(0, sapply(list(tstOneVar, tstTwoVar, tstThreeVar, tstFourVar), FUN=function(x) max(x)))
) %>%
bind_rows(apiTODS %>% filter(nImp==max(nImp)) %>% select(x=nImp, y=rfAcc)) %>%
ggplot(aes(x=x, y=y)) +
geom_point() +
geom_line() +
# Labels nudged left/down so they do not overlap the points.
# NOTE(review): `y=` here partially matches ifelse()'s `yes` argument —
# it works, but is fragile; verify before refactoring
geom_text(aes(label=round(y, 2), y=ifelse(x>=4, y=y-0.05, y), x=ifelse(x>=4, x+1, x-0.5)),
hjust=1,
size=3
) +
# Dashed reference line at the maximum observed accuracy
geom_hline(aes(yintercept=max(y)), lty=2) +
lims(y=c(0, 1)) +
labs(x="# Predictors",
y="Accuracy on holdout data",
title="Accuracy of predicting day-night season by number of predictors"
)
The first three predictors each significantly improve accuracy. A soil temperature variable helps predict season, then a radiation variable helps predict day-night, then a second soil temperature variable significantly refines prediction of season. There is very little gain for 4+ predictors
The process is converted to functional form:
# Given a fixed set of predictors (xFix), evaluate each candidate variable
# as the next addition, print a ranked accuracy/R-squared table, and return
# the named metric vector (names = candidate predictor)
runNextBestPredictor <- function(varsRun,
                                 xFix,
                                 yVar,
                                 isContVar,
                                 dfTrain,
                                 dfTest=dfTrain,
                                 useLabel="predictions based on training data applied to holdout dataset",
                                 useSub=stringr::str_to_sentence(keyLabel_v3),
                                 makePlots=FALSE
) {
  # FUNCTION ARGUMENTS:
  # varsRun: variables to be run as potential next-best predictors
  # xFix: variables that are already included in every test of next-best
  # yVar: dependent variable of interest
  # isContVar: boolean, is yVar continuous?
  # dfTrain: training data
  # dfTest: test data
  # useLabel: descriptive label
  # useSub: subtitle description
  #   NOTE(review): default depends on global keyLabel_v3 existing at call
  #   time — confirm callers always define it or pass useSub explicitly
  # makePlots: boolean, should plots be created for each predictor run?
  # vapply() replaces sapply(): type-stable numeric(1) per candidate
  vecAcc <- vapply(varsRun, FUN=function(x) {
    y <- runFullRF(dfTrain=dfTrain,
                   yVar=yVar,
                   xVars=c(xFix, x),
                   dfTest=dfTest,
                   useLabel=useLabel,
                   useSub=useSub,
                   isContVar=isContVar,
                   makePlots=makePlots,
                   returnData=TRUE
    )[["rfAcc"]]
    # Continuous targets return several metrics; keep R-squared only
    if(isTRUE(isContVar)) y[["r2"]] else y
  }, FUN.VALUE=numeric(1)
  )
  # Print candidates ranked by holdout metric, best first
  vecAcc %>%
    tibble::enframe(name="pred", value="rfAcc") %>%
    arrange(desc(rfAcc)) %>%
    print(n=40)
  vecAcc
}
The function is tested for a categorical variable:
# Test the new function: re-run the third-predictor search (candidates
# ordered by the earlier tstThreeVar results) on top of the best two
rnbpThree <- runNextBestPredictor(varsRun=(tstThreeVar %>% sort(decreasing=TRUE) %>% names),
xFix=c(bestOneVar, bestTwoVar),
yVar="todSeason",
isContVar=FALSE,
dfTrain=dfTrain_v3,
dfTest=dfTest_v3
)
##
## Accuracy of predictions based on training data applied to holdout dataset is: 85.273%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 77.84%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 75.316%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.685%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.778%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 65.124%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.843%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.236%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.955%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.002%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.207%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 64.376%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.581%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.786%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.534%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 63.067%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.366%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.973%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 60.776%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.506%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 60.449%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 62.319%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.337%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 61.805%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 60.823%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 60.168%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 57.363%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 55.961%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.101%
##
## Accuracy of predictions based on training data applied to holdout dataset is: 56.241%
## # A tibble: 30 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_100_to_255cm 0.853
## 2 pct_soil_moisture_100_to_255cm 0.778
## 3 pct_soil_temperature_28_to_100cm 0.753
## 4 pct_soil_moisture_28_to_100cm 0.658
## 5 pct_soil_temperature_7_to_28cm 0.657
## 6 pct_pressure_msl 0.651
## 7 pct_surface_pressure 0.648
## 8 pct_soil_moisture_0_to_7cm 0.644
## 9 pct_cloudcover_mid 0.642
## 10 pct_cloudcover_high 0.640
## 11 pct_dewpoint_2m 0.640
## 12 pct_temperature_2m 0.636
## 13 pct_cloudcover 0.635
## 14 pct_soil_moisture_7_to_28cm 0.632
## 15 pct_apparent_temperature 0.631
## 16 pct_windspeed_100m 0.630
## 17 pct_windgusts_10m 0.628
## 18 pct_relativehumidity_2m 0.625
## 19 pct_vapor_pressure_deficit 0.624
## 20 pct_windspeed_10m 0.623
## 21 pct_et0_fao_evapotranspiration 0.618
## 22 pct_winddirection_100m 0.613
## 23 pct_cloudcover_low 0.608
## 24 pct_precipitation 0.608
## 25 pct_snowfall 0.604
## 26 pct_winddirection_10m 0.602
## 27 pct_rain 0.574
## 28 pct_direct_normal_irradiance 0.562
## 29 pct_direct_radiation 0.561
## 30 pct_shortwave_radiation 0.560
Results are compared to the previous process; only slight differences, likely due to different random states, are observed:
# Join new-function results to the previous standalone results by predictor,
# then reshape to long form for plotting
dfCompareThree <- full_join(rownames_to_column(as.data.frame(rnbpThree), "predictor"),
                            rownames_to_column(as.data.frame(tstThreeVar), "predictor"),
                            by="predictor"
                            ) %>%
  tibble::tibble() %>%
  pivot_longer(cols=-c(predictor))
# Dot per predictor and source, sorted by accuracy, with the two-predictor
# baseline drawn as a dashed reference line
dfCompareThree %>%
  ggplot(aes(x=fct_reorder(predictor, value), y=value)) +
  geom_point(aes(color=c("rnbpThree"="New\nFunction", "tstThreeVar"="Previous\nResult")[name])) +
  coord_flip() +
  geom_hline(lty=2, yintercept=max(tstTwoVar)) +
  labs(y="Accuracy",
       x=NULL,
       title="Accuracy for each predictor added standalone as next-best third predictor",
       subtitle="Dashed line is baseline two-predictor accuracy"
       ) +
  scale_color_discrete(NULL)
The function is tested for a continuous variable:
# Candidate predictors: all training variables except temperature-derived
# ones, plus the calendar fields
candidatesCont <- c(varsTrain[!str_detect(varsTrain, "pct_temp|apparent")],
                    "month",
                    "tod",
                    "doy"
                    )
# No fixed predictors yet: each candidate is evaluated standalone
rnbpContOne <- runNextBestPredictor(varsRun=candidatesCont,
                                    xFix=character(0),
                                    yVar="temperature_2m",
                                    isContVar=TRUE,
                                    dfTrain=dfTrain,
                                    dfTest=dfTest
                                    )
##
## R-squared of predictions based on training data applied to holdout dataset is: -0.449% (RMSE 10.33 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 83.692% (RMSE 4.16 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 20.318% (RMSE 9.2 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 19.151% (RMSE 9.26 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 0.115% (RMSE 10.3 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 0.379% (RMSE 10.28 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 2.59% (RMSE 10.17 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 8.542% (RMSE 9.85 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 3.757% (RMSE 10.11 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 14.692% (RMSE 9.52 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 4.673% (RMSE 10.06 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 14.187% (RMSE 9.54 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 12.574% (RMSE 9.63 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 10.204% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 13.622% (RMSE 9.58 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 4.044% (RMSE 10.09 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 7.256% (RMSE 9.92 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: -11.051% (RMSE 10.86 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: -8.288% (RMSE 10.72 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 3.247% (RMSE 10.13 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 27.493% (RMSE 8.77 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 52.87% (RMSE 7.07 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.135% (RMSE 2.03 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 86.431% (RMSE 3.8 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 71.872% (RMSE 5.46 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 22.386% (RMSE 9.08 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 59.421% (RMSE 6.56 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 61.956% (RMSE 6.35 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 51.89% (RMSE 7.15 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 18.824% (RMSE 9.28 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 76.61% (RMSE 4.98 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 1.729% (RMSE 10.21 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 78.077% (RMSE 4.82 vs. 10.3 null)
## # A tibble: 33 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 0.961
## 2 pct_soil_temperature_7_to_28cm 0.864
## 3 pct_dewpoint_2m 0.837
## 4 doy 0.781
## 5 month 0.766
## 6 pct_soil_temperature_28_to_100cm 0.719
## 7 pct_soil_moisture_7_to_28cm 0.620
## 8 pct_soil_moisture_0_to_7cm 0.594
## 9 pct_vapor_pressure_deficit 0.529
## 10 pct_soil_moisture_28_to_100cm 0.519
## 11 pct_et0_fao_evapotranspiration 0.275
## 12 pct_soil_temperature_100_to_255cm 0.224
## 13 pct_pressure_msl 0.203
## 14 pct_surface_pressure 0.192
## 15 pct_soil_moisture_100_to_255cm 0.188
## 16 pct_cloudcover_mid 0.147
## 17 pct_shortwave_radiation 0.142
## 18 pct_diffuse_radiation 0.136
## 19 pct_direct_radiation 0.126
## 20 pct_direct_normal_irradiance 0.102
## 21 pct_cloudcover 0.0854
## 22 pct_windspeed_100m 0.0726
## 23 pct_cloudcover_high 0.0467
## 24 pct_windspeed_10m 0.0404
## 25 pct_cloudcover_low 0.0376
## 26 pct_windgusts_10m 0.0325
## 27 pct_snowfall 0.0259
## 28 tod 0.0173
## 29 pct_rain 0.00379
## 30 pct_precipitation 0.00115
## 31 pct_relativehumidity_2m -0.00449
## 32 pct_winddirection_100m -0.0829
## 33 pct_winddirection_10m -0.111
The best predictor for temperature is extracted, with a function written for reuse:
# Extract the top predictor(s) from a named accuracy / r-squared vector
#
# FUNCTION ARGUMENTS:
# x: named vector of accuracy or r-squared
# returnTbl: boolean, if TRUE convert to tibble and return, if FALSE return vector of top-n predictors
# n: number of predictors to return (+Inf will return the full tibble or vector)
#
# RETURNS: a tibble (pred, value) sorted best-first when returnTbl is TRUE,
# otherwise a character vector of the top-n predictor names
getNextBestVar <- function(x, returnTbl=FALSE, n=if(isTRUE(returnTbl)) +Inf else 1) {
  tbl <- vecToTibble(x, colNameName="pred") %>%
    arrange(desc(value)) %>%   # desc() is clearer than arrange(-value)
    slice_head(n=n)
  # Single tail expression instead of paired explicit return() calls
  if(isTRUE(returnTbl)) tbl else pull(tbl, pred)
}
# Single best predictor for the continuous temperature model
getNextBestVar(rnbpContOne)
## [1] "pct_soil_temperature_0_to_7cm"
# Top four predictors as a character vector
getNextBestVar(rnbpContOne, n=4)
## [1] "pct_soil_temperature_0_to_7cm" "pct_soil_temperature_7_to_28cm"
## [3] "pct_dewpoint_2m" "doy"
# Full sorted tibble of every candidate predictor and its r-squared
getNextBestVar(rnbpContOne, returnTbl=TRUE)
## # A tibble: 33 × 2
## pred value
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 0.961
## 2 pct_soil_temperature_7_to_28cm 0.864
## 3 pct_dewpoint_2m 0.837
## 4 doy 0.781
## 5 month 0.766
## 6 pct_soil_temperature_28_to_100cm 0.719
## 7 pct_soil_moisture_7_to_28cm 0.620
## 8 pct_soil_moisture_0_to_7cm 0.594
## 9 pct_vapor_pressure_deficit 0.529
## 10 pct_soil_moisture_28_to_100cm 0.519
## # ℹ 23 more rows
The functions are run recursively for the first three predictors of continuous variable temperature:
# Recursively select the first three next-best predictors of temperature:
# each iteration fixes the winners of all earlier iterations and scores the
# remaining candidates
yVar <- "temperature_2m"
yCont <- TRUE                  # temperature is continuous
varsTemp <- c(varsTrain[!str_detect(varsTrain, "pct_temp|apparent")], "month", "tod", "doy")
varsRun <- 3                   # number of predictors to select iteratively
rnbpTempList <- vector("list", varsRun)   # preallocated result list
for(intCtr in seq_len(varsRun)) {
  # Winners of earlier iterations; seq_len(0) on the first pass makes this
  # character(0), so no special-case branch is needed
  xFix <- vapply(rnbpTempList[seq_len(intCtr-1)],
                 FUN=function(x) x$pred[1],
                 FUN.VALUE=character(1)
                 )
  rnbpTempList[[intCtr]] <- runNextBestPredictor(varsRun=setdiff(varsTemp, xFix),
                                                 xFix=xFix,
                                                 yVar=yVar,
                                                 isContVar=yCont,
                                                 dfTrain=dfTrain,
                                                 dfTest=dfTest
                                                 ) %>%
    getNextBestVar(returnTbl=TRUE)
}
##
## R-squared of predictions based on training data applied to holdout dataset is: -0.449% (RMSE 10.33 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 83.691% (RMSE 4.16 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 20.312% (RMSE 9.2 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 19.153% (RMSE 9.26 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 0.114% (RMSE 10.3 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 0.387% (RMSE 10.28 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 2.591% (RMSE 10.17 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 8.543% (RMSE 9.85 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 3.754% (RMSE 10.11 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 14.692% (RMSE 9.52 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 4.674% (RMSE 10.06 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 14.187% (RMSE 9.54 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 12.577% (RMSE 9.63 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 10.195% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 13.626% (RMSE 9.58 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 4.043% (RMSE 10.09 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 7.263% (RMSE 9.92 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: -11.047% (RMSE 10.86 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: -8.316% (RMSE 10.72 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 3.243% (RMSE 10.13 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 27.494% (RMSE 8.77 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 52.873% (RMSE 7.07 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.134% (RMSE 2.03 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 86.428% (RMSE 3.8 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 71.877% (RMSE 5.46 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 22.374% (RMSE 9.08 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 59.428% (RMSE 6.56 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 61.959% (RMSE 6.35 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 51.879% (RMSE 7.15 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 18.784% (RMSE 9.29 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 76.613% (RMSE 4.98 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 1.729% (RMSE 10.21 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 78.067% (RMSE 4.83 vs. 10.3 null)
## # A tibble: 33 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 0.961
## 2 pct_soil_temperature_7_to_28cm 0.864
## 3 pct_dewpoint_2m 0.837
## 4 doy 0.781
## 5 month 0.766
## 6 pct_soil_temperature_28_to_100cm 0.719
## 7 pct_soil_moisture_7_to_28cm 0.620
## 8 pct_soil_moisture_0_to_7cm 0.594
## 9 pct_vapor_pressure_deficit 0.529
## 10 pct_soil_moisture_28_to_100cm 0.519
## 11 pct_et0_fao_evapotranspiration 0.275
## 12 pct_soil_temperature_100_to_255cm 0.224
## 13 pct_pressure_msl 0.203
## 14 pct_surface_pressure 0.192
## 15 pct_soil_moisture_100_to_255cm 0.188
## 16 pct_cloudcover_mid 0.147
## 17 pct_shortwave_radiation 0.142
## 18 pct_diffuse_radiation 0.136
## 19 pct_direct_radiation 0.126
## 20 pct_direct_normal_irradiance 0.102
## 21 pct_cloudcover 0.0854
## 22 pct_windspeed_100m 0.0726
## 23 pct_cloudcover_high 0.0467
## 24 pct_windspeed_10m 0.0404
## 25 pct_cloudcover_low 0.0375
## 26 pct_windgusts_10m 0.0324
## 27 pct_snowfall 0.0259
## 28 tod 0.0173
## 29 pct_rain 0.00387
## 30 pct_precipitation 0.00114
## 31 pct_relativehumidity_2m -0.00449
## 32 pct_winddirection_100m -0.0832
## 33 pct_winddirection_10m -0.110
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.57% (RMSE 1.91 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.079% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.095% (RMSE 2.04 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.094% (RMSE 2.04 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 85.409% (RMSE 3.94 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 86.161% (RMSE 3.83 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 81.562% (RMSE 4.42 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.058% (RMSE 2.29 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 91.435% (RMSE 3.02 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 91.673% (RMSE 2.97 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 90.655% (RMSE 3.15 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 94.179% (RMSE 2.49 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 93.814% (RMSE 2.56 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 93.938% (RMSE 2.54 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 94.506% (RMSE 2.42 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.161% (RMSE 2.02 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.234% (RMSE 2 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.451% (RMSE 2.2 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.536% (RMSE 2.18 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.256% (RMSE 1.99 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.162% (RMSE 2.02 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.411% (RMSE 1.95 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.436% (RMSE 1.95 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.265% (RMSE 1.99 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.896% (RMSE 2.09 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.13% (RMSE 2.03 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.017% (RMSE 2.06 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.934% (RMSE 2.08 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.866% (RMSE 2.09 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.679% (RMSE 2.14 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 84.38% (RMSE 4.07 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.09% (RMSE 2.04 vs. 10.3 null)
## # A tibble: 32 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_dewpoint_2m 0.981
## 2 pct_relativehumidity_2m 0.966
## 3 pct_soil_temperature_7_to_28cm 0.964
## 4 pct_vapor_pressure_deficit 0.964
## 5 pct_soil_temperature_28_to_100cm 0.963
## 6 pct_windgusts_10m 0.963
## 7 pct_windspeed_100m 0.962
## 8 pct_et0_fao_evapotranspiration 0.962
## 9 pct_windspeed_10m 0.962
## 10 pct_soil_moisture_0_to_7cm 0.961
## 11 pct_pressure_msl 0.961
## 12 pct_surface_pressure 0.961
## 13 doy 0.961
## 14 pct_soil_moisture_7_to_28cm 0.960
## 15 pct_soil_moisture_28_to_100cm 0.959
## 16 pct_soil_temperature_100_to_255cm 0.959
## 17 pct_soil_moisture_100_to_255cm 0.959
## 18 month 0.957
## 19 pct_winddirection_100m 0.955
## 20 pct_winddirection_10m 0.955
## 21 pct_cloudcover 0.951
## 22 pct_diffuse_radiation 0.945
## 23 pct_shortwave_radiation 0.942
## 24 pct_direct_normal_irradiance 0.939
## 25 pct_direct_radiation 0.938
## 26 pct_cloudcover_mid 0.917
## 27 pct_cloudcover_low 0.914
## 28 pct_cloudcover_high 0.907
## 29 pct_rain 0.862
## 30 pct_precipitation 0.854
## 31 tod 0.844
## 32 pct_snowfall 0.816
##
## R-squared of predictions based on training data applied to holdout dataset is: 99.82% (RMSE 0.44 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.076% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.072% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 95.025% (RMSE 2.3 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 94.949% (RMSE 2.32 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 91.891% (RMSE 2.93 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.889% (RMSE 1.5 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.293% (RMSE 1.98 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.054% (RMSE 1.77 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 96.47% (RMSE 1.94 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.076% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.054% (RMSE 1.44 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.907% (RMSE 1.49 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.102% (RMSE 1.42 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.082% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.08% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.622% (RMSE 1.59 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.629% (RMSE 1.59 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.075% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.575% (RMSE 1.23 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 99.874% (RMSE 0.37 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.264% (RMSE 1.36 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.268% (RMSE 1.36 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.055% (RMSE 1.44 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.094% (RMSE 1.42 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.101% (RMSE 1.42 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.922% (RMSE 1.49 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 97.957% (RMSE 1.47 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.022% (RMSE 1.45 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 93.295% (RMSE 2.67 vs. 10.3 null)
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.113% (RMSE 1.42 vs. 10.3 null)
## # A tibble: 31 × 2
## pred rfAcc
## <chr> <dbl>
## 1 pct_vapor_pressure_deficit 0.999
## 2 pct_relativehumidity_2m 0.998
## 3 pct_et0_fao_evapotranspiration 0.986
## 4 pct_soil_temperature_28_to_100cm 0.983
## 5 pct_soil_temperature_7_to_28cm 0.983
## 6 doy 0.981
## 7 pct_diffuse_radiation 0.981
## 8 pct_soil_moisture_7_to_28cm 0.981
## 9 pct_soil_moisture_0_to_7cm 0.981
## 10 pct_windspeed_10m 0.981
## 11 pct_windspeed_100m 0.981
## 12 pct_shortwave_radiation 0.981
## 13 pct_pressure_msl 0.981
## 14 pct_windgusts_10m 0.981
## 15 pct_surface_pressure 0.981
## 16 pct_soil_temperature_100_to_255cm 0.981
## 17 pct_direct_radiation 0.981
## 18 month 0.980
## 19 pct_soil_moisture_100_to_255cm 0.980
## 20 pct_soil_moisture_28_to_100cm 0.979
## 21 pct_direct_normal_irradiance 0.979
## 22 pct_cloudcover 0.979
## 23 pct_winddirection_100m 0.976
## 24 pct_winddirection_10m 0.976
## 25 pct_cloudcover_mid 0.971
## 26 pct_cloudcover_high 0.965
## 27 pct_cloudcover_low 0.963
## 28 pct_precipitation 0.950
## 29 pct_rain 0.949
## 30 tod 0.933
## 31 pct_snowfall 0.919
rnbpTempList
## [[1]]
## # A tibble: 33 × 2
## pred value
## <chr> <dbl>
## 1 pct_soil_temperature_0_to_7cm 0.961
## 2 pct_soil_temperature_7_to_28cm 0.864
## 3 pct_dewpoint_2m 0.837
## 4 doy 0.781
## 5 month 0.766
## 6 pct_soil_temperature_28_to_100cm 0.719
## 7 pct_soil_moisture_7_to_28cm 0.620
## 8 pct_soil_moisture_0_to_7cm 0.594
## 9 pct_vapor_pressure_deficit 0.529
## 10 pct_soil_moisture_28_to_100cm 0.519
## # ℹ 23 more rows
##
## [[2]]
## # A tibble: 32 × 2
## pred value
## <chr> <dbl>
## 1 pct_dewpoint_2m 0.981
## 2 pct_relativehumidity_2m 0.966
## 3 pct_soil_temperature_7_to_28cm 0.964
## 4 pct_vapor_pressure_deficit 0.964
## 5 pct_soil_temperature_28_to_100cm 0.963
## 6 pct_windgusts_10m 0.963
## 7 pct_windspeed_100m 0.962
## 8 pct_et0_fao_evapotranspiration 0.962
## 9 pct_windspeed_10m 0.962
## 10 pct_soil_moisture_0_to_7cm 0.961
## # ℹ 22 more rows
##
## [[3]]
## # A tibble: 31 × 2
## pred value
## <chr> <dbl>
## 1 pct_vapor_pressure_deficit 0.999
## 2 pct_relativehumidity_2m 0.998
## 3 pct_et0_fao_evapotranspiration 0.986
## 4 pct_soil_temperature_28_to_100cm 0.983
## 5 pct_soil_temperature_7_to_28cm 0.983
## 6 doy 0.981
## 7 pct_diffuse_radiation 0.981
## 8 pct_soil_moisture_7_to_28cm 0.981
## 9 pct_soil_moisture_0_to_7cm 0.981
## 10 pct_windspeed_10m 0.981
## # ℹ 21 more rows
Improvements in R-squared are plotted:
# Rank every candidate within each iteration from worst to best r-squared
dfRnbpTemp <- rnbpTempList %>%
  bind_rows(.id="src") %>%
  arrange(src, value) %>%
  group_by(src) %>%
  mutate(rn=row_number()) %>%
  ungroup()
# One faceted line per iteration, with a dashed line at the r-squared ceiling
ggplot(dfRnbpTemp, aes(x=rn, y=value)) +
  geom_line(aes(group=src, color=src)) +
  scale_color_discrete("# Preds") +
  labs(title="R-squared by next predictor", y="R-squared", x="Next-best sorted from worst to best") +
  facet_wrap(~src, scales="free_y") +
  lims(y=c(NA, 1)) +
  geom_hline(yintercept=1, lty=2)
There are generally a handful of potential next-best predictors that significantly outperform the others.
There are combinations of 2 variables that outperform the accuracy of the iterative approach:
# Iterative model: the two predictors selected by the stepwise process
runFullRF(yVar="temperature_2m",
          xVars=c("pct_soil_temperature_0_to_7cm", "pct_dewpoint_2m"),
          dfTrain=dfTrain,
          dfTest=dfTest,
          useLabel=keyLabel_v3,
          useSub=stringr::str_to_sentence(keyLabel_v3),
          isContVar=TRUE,
          returnData=FALSE
          )
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.079% (RMSE 1.43 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
# Relative humidity and vapor pressure deficit
runFullRF(yVar="temperature_2m",
          xVars=c("pct_relativehumidity_2m", "pct_vapor_pressure_deficit"),
          dfTrain=dfTrain,
          dfTest=dfTest,
          useLabel=keyLabel_v3,
          useSub=stringr::str_to_sentence(keyLabel_v3),
          isContVar=TRUE,
          returnData=FALSE
          )
##
## R-squared of predictions based on training data applied to holdout dataset is: 98.972% (RMSE 1.04 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
# Dewpoint and relative humidity
runFullRF(yVar="temperature_2m",
          xVars=c("pct_dewpoint_2m", "pct_relativehumidity_2m"),
          dfTrain=dfTrain,
          dfTest=dfTest,
          useLabel=keyLabel_v3,
          useSub=stringr::str_to_sentence(keyLabel_v3),
          isContVar=TRUE,
          returnData=FALSE
          )
##
## R-squared of predictions based on training data applied to holdout dataset is: 99.819% (RMSE 0.44 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
# Dewpoint and vapor pressure deficit
runFullRF(yVar="temperature_2m",
          xVars=c("pct_dewpoint_2m", "pct_vapor_pressure_deficit"),
          dfTrain=dfTrain,
          dfTest=dfTest,
          useLabel=keyLabel_v3,
          useSub=stringr::str_to_sentence(keyLabel_v3),
          isContVar=TRUE,
          returnData=FALSE
          )
##
## R-squared of predictions based on training data applied to holdout dataset is: 99.927% (RMSE 0.28 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
All else equal at a given locale, temperature and dewpoint mathematically determine relative humidity and vapor pressure deficit. So, while relative humidity has no standalone predictive power on temperature, in combination with dewpoint, it drives nearly 100% R-squared
The relationships between temperature, dewpoint, and relative humidity are explored:
# Count unique (temperature, dewpoint, relative humidity) combinations at
# four representative humidity levels
rhLevels <- c(30, 50, 70, 90)
dfMiniTest <- dfTrain %>%
  select(t=temperature_2m, d=dewpoint_2m, rh=relativehumidity_2m) %>%
  count(t, d, rh) %>%
  filter(rh %in% rhLevels)
# Dewpoint vs. temperature, colored by humidity level and sized by frequency;
# dashed line marks t == d (saturation, i.e. 100% RH)
ggplot(dfMiniTest, aes(x=d, y=t)) +
  geom_point(aes(color=factor(rh), size=n)) +
  geom_abline(slope=1, intercept=0, lty=2) +
  scale_color_discrete("% RH") +
  labs(title="Dewpoint vs. Temperature for Select Relative Humidities", y="Temp", x="Dew Point")
# Per-humidity intercept and slope: temperature is nearly linear in dewpoint
# within each humidity level
summary(lm(t~factor(rh)+d:factor(rh)+0, data=dfMiniTest))
##
## Call:
## lm(formula = t ~ factor(rh) + d:factor(rh) + 0, data = dfMiniTest)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34192 -0.06763 -0.00016 0.06874 0.37981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## factor(rh)30 1.786e+01 7.760e-03 2301.5 <2e-16 ***
## factor(rh)50 9.976e+00 3.709e-03 2689.8 <2e-16 ***
## factor(rh)70 5.028e+00 4.115e-03 1222.0 <2e-16 ***
## factor(rh)90 1.459e+00 5.781e-03 252.3 <2e-16 ***
## factor(rh)30:d 1.146e+00 6.366e-04 1800.8 <2e-16 ***
## factor(rh)50:d 1.084e+00 3.242e-04 3345.1 <2e-16 ***
## factor(rh)70:d 1.043e+00 3.268e-04 3190.9 <2e-16 ***
## factor(rh)90:d 1.012e+00 4.318e-04 2344.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09285 on 2203 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 8.839e+06 on 8 and 2203 DF, p-value: < 2.2e-16
All possible combinations of 2 variables are explored on a smaller dataset:
# Score the holdout r-squared of every 2-variable combination, fitting on a
# 5,000-row training subsample to keep runtime manageable
possTempVars <- c(varsTrain[!str_detect(varsTrain, "pct_temp|apparent")], "month", "tod", "doy")
set.seed(24031815)
idxSmall <- sample(1:nrow(dfTrain), 5000, replace=FALSE)
# Preallocate one row per pair instead of growing the matrix with rbind()
# inside the loop, which copies the whole matrix on every iteration
mtxSmall <- matrix(NA_real_, nrow=choose(length(possTempVars), 2), ncol=3)
intRow <- 0L
# seq_len() guards the degenerate case of fewer than two candidate variables
for(idx1 in seq_len(length(possTempVars)-1)) {
  for(idx2 in (idx1+1):length(possTempVars)) {
    r2Small <- runFullRF(dfTrain=dfTrain[idxSmall,],
                         yVar="temperature_2m",
                         xVars=possTempVars[c(idx1, idx2)],
                         dfTest=dfTest,
                         useLabel=keyLabel,
                         useSub=stringr::str_to_sentence(keyLabel),
                         isContVar=TRUE,
                         makePlots=FALSE,
                         returnData=TRUE
                         )[["rfAcc"]][["r2"]]
    intRow <- intRow + 1L
    # Columns: index of first variable, index of second variable, r-squared
    mtxSmall[intRow, ] <- c(idx1, idx2, r2Small)
  }
}
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.635% (RMSE 0.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.222% (RMSE 9.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.967% (RMSE 9.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.711% (RMSE 10.34 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.171% (RMSE 10.29 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.116% (RMSE 10.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.225% (RMSE 10.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.163% (RMSE 10.24 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.255% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.484% (RMSE 10.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.878% (RMSE 9.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.224% (RMSE 9.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.373% (RMSE 9.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.768% (RMSE 9.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.647% (RMSE 10.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.903% (RMSE 10.35 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -27.317% (RMSE 11.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -25.096% (RMSE 11.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.348% (RMSE 10.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.533% (RMSE 7.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.827% (RMSE 1.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.298% (RMSE 1.98 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.682% (RMSE 3.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.492% (RMSE 5.3 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.261% (RMSE 9.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.588% (RMSE 6.47 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.66% (RMSE 6.38 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.494% (RMSE 7.18 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.707% (RMSE 9.68 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.807% (RMSE 4.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.652% (RMSE 10.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.9% (RMSE 4.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.454% (RMSE 4.32 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.532% (RMSE 4.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.523% (RMSE 4.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.71% (RMSE 4.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.445% (RMSE 5.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.133% (RMSE 4.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.026% (RMSE 4.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.426% (RMSE 4.56 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.097% (RMSE 4.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.075% (RMSE 3.25 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.496% (RMSE 3.34 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.854% (RMSE 3.59 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.999% (RMSE 3.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.813% (RMSE 4.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.705% (RMSE 4.28 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.227% (RMSE 4.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.852% (RMSE 4.51 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.015% (RMSE 4.25 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.553% (RMSE 2.4 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.872% (RMSE 0.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 97.873% (RMSE 1.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.479% (RMSE 3.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.121% (RMSE 3.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.7% (RMSE 4.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.111% (RMSE 3.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.249% (RMSE 3.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.884% (RMSE 4.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.325% (RMSE 4.21 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.091% (RMSE 3.4 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.335% (RMSE 5.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.033% (RMSE 3.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.555% (RMSE 6.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.667% (RMSE 9.29 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.024% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.431% (RMSE 9.25 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.366% (RMSE 9.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.867% (RMSE 9.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.516% (RMSE 9.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.888% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.496% (RMSE 8.77 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.172% (RMSE 8.79 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.297% (RMSE 8.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.878% (RMSE 8.87 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.082% (RMSE 9.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.388% (RMSE 9.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.619% (RMSE 10.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.019% (RMSE 10.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.984% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.79% (RMSE 8.13 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.35% (RMSE 6.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.809% (RMSE 2.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.468% (RMSE 3.93 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.94% (RMSE 5.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.626% (RMSE 8.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.023% (RMSE 6.35 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.446% (RMSE 6.4 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.696% (RMSE 7.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.631% (RMSE 8.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.045% (RMSE 4.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.333% (RMSE 9.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.166% (RMSE 4.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.214% (RMSE 9.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.156% (RMSE 9.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.108% (RMSE 9.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.019% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.623% (RMSE 9.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.337% (RMSE 9.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.853% (RMSE 9.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.336% (RMSE 8.84 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.024% (RMSE 8.86 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.615% (RMSE 9.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.792% (RMSE 8.93 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.581% (RMSE 9.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.198% (RMSE 9.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -4.202% (RMSE 10.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.982% (RMSE 10.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.363% (RMSE 9.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.976% (RMSE 8.18 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.996% (RMSE 6.91 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.799% (RMSE 2.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.315% (RMSE 3.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.638% (RMSE 5.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.153% (RMSE 8.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.516% (RMSE 6.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.985% (RMSE 6.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.47% (RMSE 7.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.401% (RMSE 8.84 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.936% (RMSE 4.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.384% (RMSE 9.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.044% (RMSE 4.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.025% (RMSE 10.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.763% (RMSE 10.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.884% (RMSE 9.89 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.977% (RMSE 10.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.741% (RMSE 9.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.768% (RMSE 10.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.185% (RMSE 9.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.763% (RMSE 9.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.212% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.175% (RMSE 9.66 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.417% (RMSE 10.13 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.351% (RMSE 9.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -8.223% (RMSE 10.72 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -5.703% (RMSE 10.59 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.073% (RMSE 10.14 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.2% (RMSE 8.79 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.969% (RMSE 6.99 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.573% (RMSE 2.81 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.461% (RMSE 4.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.774% (RMSE 5.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.011% (RMSE 9.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.948% (RMSE 6.68 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.981% (RMSE 6.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.211% (RMSE 7.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.415% (RMSE 9.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.68% (RMSE 5.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.442% (RMSE 10.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.447% (RMSE 5.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.648% (RMSE 10.17 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.071% (RMSE 9.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.247% (RMSE 10.13 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.521% (RMSE 9.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.136% (RMSE 10.14 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.053% (RMSE 9.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.967% (RMSE 9.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.19% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.904% (RMSE 9.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.831% (RMSE 10.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.837% (RMSE 9.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -6.587% (RMSE 10.64 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -4.335% (RMSE 10.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.536% (RMSE 10.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.653% (RMSE 8.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.799% (RMSE 7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.564% (RMSE 3.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.827% (RMSE 4.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.172% (RMSE 5.72 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.611% (RMSE 9.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.553% (RMSE 6.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.037% (RMSE 6.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.139% (RMSE 7.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.308% (RMSE 9.48 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.259% (RMSE 5.8 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.515% (RMSE 10.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.942% (RMSE 5.74 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.241% (RMSE 9.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.359% (RMSE 10.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.109% (RMSE 9.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.562% (RMSE 9.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.031% (RMSE 9.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.791% (RMSE 9.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.499% (RMSE 9.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.72% (RMSE 9.51 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.434% (RMSE 10.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.439% (RMSE 9.86 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.74% (RMSE 10.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -1.772% (RMSE 10.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.85% (RMSE 10.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.951% (RMSE 8.87 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.342% (RMSE 7.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.999% (RMSE 4.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.055% (RMSE 4.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.509% (RMSE 6.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.325% (RMSE 9.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.056% (RMSE 7.21 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.369% (RMSE 7.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.187% (RMSE 7.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.476% (RMSE 9.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.629% (RMSE 6.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.939% (RMSE 10.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.967% (RMSE 6.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.078% (RMSE 9.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.148% (RMSE 9.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.858% (RMSE 9.89 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.53% (RMSE 9.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.229% (RMSE 9.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.603% (RMSE 9.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.709% (RMSE 9.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.913% (RMSE 10.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.225% (RMSE 9.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -7.968% (RMSE 10.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -5.961% (RMSE 10.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.682% (RMSE 10.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.614% (RMSE 8.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.315% (RMSE 6.81 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.027% (RMSE 2.3 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.633% (RMSE 3.91 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.065% (RMSE 5.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.534% (RMSE 8.83 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.037% (RMSE 6.59 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.667% (RMSE 6.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.579% (RMSE 6.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.416% (RMSE 9.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.47% (RMSE 5.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.156% (RMSE 9.87 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.847% (RMSE 5.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.161% (RMSE 9.66 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.619% (RMSE 10.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.048% (RMSE 9.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.971% (RMSE 9.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.19% (RMSE 9.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.494% (RMSE 9.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.596% (RMSE 10.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.504% (RMSE 9.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -8.426% (RMSE 10.73 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -5.857% (RMSE 10.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.198% (RMSE 10.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.356% (RMSE 8.84 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.913% (RMSE 7.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.172% (RMSE 3.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.33% (RMSE 4.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.686% (RMSE 5.77 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.444% (RMSE 9.13 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.908% (RMSE 6.84 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.271% (RMSE 6.81 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.66% (RMSE 7.24 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.966% (RMSE 9.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.393% (RMSE 5.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.26% (RMSE 10.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.11% (RMSE 5.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.071% (RMSE 9.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.408% (RMSE 9.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.003% (RMSE 9.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.985% (RMSE 9.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.312% (RMSE 9.14 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.891% (RMSE 9.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.21% (RMSE 9.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.936% (RMSE 10.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.759% (RMSE 10.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.64% (RMSE 9.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.724% (RMSE 8.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.225% (RMSE 6.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.745% (RMSE 2.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.666% (RMSE 4.29 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.823% (RMSE 5.66 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.276% (RMSE 8.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.288% (RMSE 6.65 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.326% (RMSE 6.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.823% (RMSE 6.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.98% (RMSE 8.86 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.653% (RMSE 5.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.428% (RMSE 9.59 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.059% (RMSE 5.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.418% (RMSE 9.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.17% (RMSE 9.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.526% (RMSE 9.69 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.368% (RMSE 9.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.775% (RMSE 10.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.804% (RMSE 10 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -6.143% (RMSE 10.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -5.377% (RMSE 10.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.879% (RMSE 10.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.263% (RMSE 8.73 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.249% (RMSE 7.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.782% (RMSE 2.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.51% (RMSE 4.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.934% (RMSE 5.74 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.035% (RMSE 8.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.747% (RMSE 6.93 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.655% (RMSE 6.78 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.359% (RMSE 7.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.206% (RMSE 9.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.397% (RMSE 5.79 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.349% (RMSE 10.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.322% (RMSE 5.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.029% (RMSE 9.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.264% (RMSE 8.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.621% (RMSE 9.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.192% (RMSE 9.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.271% (RMSE 9.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -1.549% (RMSE 10.38 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.883% (RMSE 10.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.073% (RMSE 9.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.15% (RMSE 7.56 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.804% (RMSE 6.93 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.646% (RMSE 2.38 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.756% (RMSE 3.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.952% (RMSE 5.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.638% (RMSE 8.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.686% (RMSE 6.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.432% (RMSE 6.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.718% (RMSE 6.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.896% (RMSE 8.81 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.717% (RMSE 4.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.007% (RMSE 9.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.108% (RMSE 4.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.121% (RMSE 8.86 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.859% (RMSE 9.45 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.632% (RMSE 9.63 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.974% (RMSE 9.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.495% (RMSE 10.48 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -1.099% (RMSE 10.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.015% (RMSE 9.66 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.241% (RMSE 7.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.021% (RMSE 6.91 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.405% (RMSE 2.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.397% (RMSE 3.51 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.019% (RMSE 5.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.568% (RMSE 8.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.949% (RMSE 6.52 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.157% (RMSE 6.25 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.451% (RMSE 6.64 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.752% (RMSE 8.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.057% (RMSE 5.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.003% (RMSE 9.55 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.961% (RMSE 4.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.877% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.123% (RMSE 9.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.517% (RMSE 9.8 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.948% (RMSE 10.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.931% (RMSE 10.35 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.119% (RMSE 9.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.238% (RMSE 8.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.221% (RMSE 6.82 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.866% (RMSE 2.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.385% (RMSE 3.8 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.032% (RMSE 5.35 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.825% (RMSE 8.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.717% (RMSE 6.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.046% (RMSE 6.35 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.099% (RMSE 6.9 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.83% (RMSE 9.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.244% (RMSE 5.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.508% (RMSE 9.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.71% (RMSE 5.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.402% (RMSE 9.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.804% (RMSE 9.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.663% (RMSE 10.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.326% (RMSE 10.13 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.787% (RMSE 9.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.18% (RMSE 8.61 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.443% (RMSE 7.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.439% (RMSE 2.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.567% (RMSE 3.48 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.094% (RMSE 5.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.971% (RMSE 8.18 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.201% (RMSE 6.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.009% (RMSE 6.18 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.951% (RMSE 6.6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.703% (RMSE 8.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.819% (RMSE 4.96 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.892% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.386% (RMSE 4.9 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.558% (RMSE 10.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -19.245% (RMSE 11.25 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -17.4% (RMSE 11.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.612% (RMSE 10.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.028% (RMSE 8.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.495% (RMSE 7.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.789% (RMSE 2.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.139% (RMSE 3.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.349% (RMSE 5.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.02% (RMSE 9.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.663% (RMSE 6.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.137% (RMSE 6.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.839% (RMSE 7.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.433% (RMSE 9.59 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.429% (RMSE 5.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.382% (RMSE 10.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.346% (RMSE 5.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -11.296% (RMSE 10.87 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -9.732% (RMSE 10.79 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.533% (RMSE 10.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.47% (RMSE 8.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.194% (RMSE 7.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.774% (RMSE 2.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.317% (RMSE 3.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.547% (RMSE 5.69 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.363% (RMSE 9.14 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.663% (RMSE 6.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.726% (RMSE 6.86 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.436% (RMSE 7.4 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.361% (RMSE 9.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.422% (RMSE 5.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.267% (RMSE 9.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.772% (RMSE 5.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -15.727% (RMSE 11.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -21.562% (RMSE 11.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.065% (RMSE 9.72 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.036% (RMSE 8.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.13% (RMSE 2.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.245% (RMSE 4.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.271% (RMSE 6.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.212% (RMSE 10.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.84% (RMSE 7.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.282% (RMSE 6.97 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.082% (RMSE 7.84 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.605% (RMSE 10.01 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.956% (RMSE 5.26 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.388% (RMSE 10.48 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.197% (RMSE 5.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -20.872% (RMSE 11.33 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.774% (RMSE 9.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.337% (RMSE 8.47 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.132% (RMSE 2.27 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.155% (RMSE 4.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.112% (RMSE 5.91 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.285% (RMSE 10.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.8% (RMSE 7.44 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.205% (RMSE 6.9 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.708% (RMSE 7.66 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.153% (RMSE 9.98 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.363% (RMSE 5.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.205% (RMSE 10.42 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.722% (RMSE 5.28 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.29% (RMSE 8.54 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.493% (RMSE 7.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.891% (RMSE 2.09 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.907% (RMSE 4 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.404% (RMSE 5.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.703% (RMSE 9.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.352% (RMSE 6.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.577% (RMSE 6.79 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.921% (RMSE 7.36 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.244% (RMSE 9.49 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.659% (RMSE 5.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.064% (RMSE 10.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.539% (RMSE 5.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.668% (RMSE 6.38 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.983% (RMSE 2.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.262% (RMSE 3.22 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.443% (RMSE 4.78 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.55% (RMSE 7.67 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.121% (RMSE 6 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.37% (RMSE 5.7 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.943% (RMSE 6.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.591% (RMSE 8.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.689% (RMSE 4.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.491% (RMSE 8.71 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.815% (RMSE 4.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.022% (RMSE 2.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.165% (RMSE 3.23 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.689% (RMSE 4.53 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.248% (RMSE 6.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.478% (RMSE 5.78 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.402% (RMSE 5.41 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.804% (RMSE 5.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.517% (RMSE 7.17 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.483% (RMSE 4.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.503% (RMSE 7.39 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.28% (RMSE 4.08 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.083% (RMSE 2.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.985% (RMSE 2.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.387% (RMSE 2.21 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.782% (RMSE 2.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.619% (RMSE 2.16 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.826% (RMSE 2.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.515% (RMSE 2.18 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.401% (RMSE 2.21 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.015% (RMSE 3.99 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.692% (RMSE 2.14 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.218% (RMSE 3.68 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.307% (RMSE 3.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.847% (RMSE 3.88 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.364% (RMSE 3.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.896% (RMSE 3.87 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.111% (RMSE 3.98 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.596% (RMSE 3.77 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.609% (RMSE 4.77 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 85.008% (RMSE 3.99 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.054% (RMSE 5.15 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.554% (RMSE 5.5 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.806% (RMSE 5.75 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.821% (RMSE 5.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.508% (RMSE 5.1 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.884% (RMSE 4.85 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.606% (RMSE 6.04 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.422% (RMSE 5.11 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.873% (RMSE 6.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.706% (RMSE 6.12 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.532% (RMSE 7.17 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.995% (RMSE 7.57 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.209% (RMSE 4.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.659% (RMSE 9.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.65% (RMSE 5.19 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.801% (RMSE 6.2 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.457% (RMSE 6.31 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.202% (RMSE 7.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.965% (RMSE 4.94 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.589% (RMSE 7.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.154% (RMSE 4.92 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.694% (RMSE 6.62 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.956% (RMSE 6.76 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.277% (RMSE 5.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.571% (RMSE 7.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.975% (RMSE 5.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.455% (RMSE 6.8 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.289% (RMSE 5.02 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.782% (RMSE 7.37 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.78% (RMSE 5.07 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.974% (RMSE 5.05 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.691% (RMSE 9.46 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.898% (RMSE 5.06 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.163% (RMSE 6.58 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.001% (RMSE 4.83 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.053% (RMSE 6.51 vs. 10.3 null)
Predictive success by metric is explored:
# Attach predictor names to each row of the small-sample screening results
# (mtxSmall columns: variable index 1, variable index 2, holdout R-squared)
dfSmallR2 <- mtxSmall %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possTempVars[idx1],
    var2 = possTempVars[idx2],
    rn = row_number()
  )
# Strongest 2-predictor combinations first
dfSmallR2 %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 528 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_dewpoint_2m pct_vapor_pressure_deficit 0.999
## 2 pct_relativehumidity_2m pct_dewpoint_2m 0.996
## 3 pct_relativehumidity_2m pct_vapor_pressure_deficit 0.988
## 4 pct_dewpoint_2m pct_soil_temperature_0_to_7cm 0.979
## 5 pct_relativehumidity_2m pct_soil_temperature_0_to_7cm 0.963
## 6 pct_soil_temperature_0_to_7cm pct_soil_temperature_7_to_28cm 0.961
## 7 pct_vapor_pressure_deficit pct_soil_temperature_0_to_7cm 0.960
## 8 pct_soil_temperature_0_to_7cm pct_soil_temperature_28_to_100cm 0.960
## 9 pct_et0_fao_evapotranspiration pct_soil_temperature_0_to_7cm 0.960
## 10 pct_windgusts_10m pct_soil_temperature_0_to_7cm 0.959
## 11 pct_soil_temperature_0_to_7cm pct_soil_moisture_28_to_100cm 0.958
## 12 pct_pressure_msl pct_soil_temperature_0_to_7cm 0.958
## 13 pct_surface_pressure pct_soil_temperature_0_to_7cm 0.958
## 14 pct_windspeed_10m pct_soil_temperature_0_to_7cm 0.958
## 15 pct_soil_temperature_0_to_7cm pct_soil_moisture_0_to_7cm 0.958
## 16 pct_windspeed_100m pct_soil_temperature_0_to_7cm 0.958
## 17 pct_soil_temperature_0_to_7cm doy 0.957
## 18 pct_soil_temperature_0_to_7cm pct_soil_moisture_7_to_28cm 0.956
## 19 pct_soil_temperature_0_to_7cm pct_soil_moisture_100_to_255cm 0.955
## 20 pct_soil_temperature_0_to_7cm month 0.954
## # ℹ 508 more rows
# For each variable, summarize the R-squared across every 2-predictor model
# it appears in (min / mean / max), then plot the range per variable.
dfSmallR2 %>%
  pivot_longer(cols = c(var1, var2)) %>%
  group_by(value) %>%
  summarize(
    r2_min = min(r2),
    r2_mu = mean(r2),
    r2_max = max(r2),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = fct_reorder(value, r2_mu))) +
  coord_flip() +
  geom_point(aes(y = r2_mu)) +
  geom_errorbar(aes(ymin = r2_min, ymax = r2_max)) +
  lims(y = c(NA, 1)) +
  geom_hline(yintercept = 1, lty = 2, color = "red") +
  labs(
    title = "R-squared in every 2-predictor model including self and one other",
    subtitle = "Predicting temperature_2m",
    y = "Range of R2 (min-mean-max)",
    x = NULL
  )
The variables with the best maximum R-squared are dewpoint, vapor pressure deficit, and relative humidity. Of note, these variables, when paired with suboptimal second predictors, can also produce very poor predictions, such as the negative R-squared for the worst 2-variable model that includes relative humidity.
The best combinations are explored using the full training dataset:
# Collect the distinct variables that appear in any pair with holdout
# R-squared >= 0.975, keeping the order in which they first appear when
# pairs are ranked best-first.
dfTopPairs <- dfSmallR2 %>%
  arrange(desc(r2)) %>%
  filter(r2 >= 0.975)
# rbind + c() interleaves var1[1], var2[1], var1[2], var2[2], ... exactly
# as pivot_longer over (var1, var2) would; unique() keeps first appearances.
possLargeVars <- unique(c(rbind(dfTopPairs$var1, dfTopPairs$var2)))
possLargeVars
## [1] "pct_dewpoint_2m" "pct_vapor_pressure_deficit"
## [3] "pct_relativehumidity_2m" "pct_soil_temperature_0_to_7cm"
# Re-fit every 2-predictor combination of the top screening variables, this
# time on the FULL training data (the screen above used a 5,000-row sample).
# mtxLarge rows hold (idx1, idx2, holdout R-squared).
# Fixes: dfTrain[,] was a redundant self-subset (and would silently drop a
# one-column data frame to a vector); 1:(n-1) replaced with seq_len(n-1),
# which is safe for degenerate lengths.
nLargeVars <- length(possLargeVars)
mtxLarge <- matrix(nrow = 0, ncol = 3)
for (idx1 in seq_len(nLargeVars - 1L)) {
  for (idx2 in (idx1 + 1L):nLargeVars) {
    # Holdout R-squared for this variable pair on the full training data
    r2Large <- runFullRF(dfTrain = dfTrain,
                         yVar = "temperature_2m",
                         xVars = possLargeVars[c(idx1, idx2)],
                         dfTest = dfTest,
                         useLabel = keyLabel,
                         useSub = stringr::str_to_sentence(keyLabel),
                         isContVar = TRUE,
                         makePlots = FALSE,
                         returnData = TRUE
    )[["rfAcc"]][["r2"]]
    # Only choose(n, 2) iterations here, so rbind growth is acceptable
    mtxLarge <- rbind(mtxLarge, c(idx1, idx2, r2Large))
  }
}
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.925% (RMSE 0.28 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.796% (RMSE 0.47 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.076% (RMSE 1.43 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.993% (RMSE 1.03 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.404% (RMSE 1.95 vs. 10.3 null)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 96.569% (RMSE 1.91 vs. 10.3 null)
# Attach predictor names to the full-data results, mirroring dfSmallR2
dfLargeR2 <- mtxLarge %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possLargeVars[idx1],
    var2 = possLargeVars[idx2],
    rn = row_number()
  )
# Rank the full-data pairs by holdout R-squared
dfLargeR2 %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 6 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_dewpoint_2m pct_vapor_pressure_deficit 0.999
## 2 pct_dewpoint_2m pct_relativehumidity_2m 0.998
## 3 pct_vapor_pressure_deficit pct_relativehumidity_2m 0.990
## 4 pct_dewpoint_2m pct_soil_temperature_0_to_7cm 0.981
## 5 pct_relativehumidity_2m pct_soil_temperature_0_to_7cm 0.966
## 6 pct_vapor_pressure_deficit pct_soil_temperature_0_to_7cm 0.964
The process is repeated for categorical variable todSeason, which combines day-night and season:
# Repeat the 2-predictor screen for the categorical target todSeason
# (day-night combined with season), on a reproducible 5,000-row subsample
# of the training data.
possTODSVars <- c(varsTrain)
set.seed(24032104)  # fixed seed so the subsample is reproducible
idxSmallTODS <- sample(1:nrow(dfTrain_v3), 5000, replace = FALSE)
mtxSmallTODS <- matrix(nrow = 0, ncol = 3)
nTODSVars <- length(possTODSVars)
for (i in 1:(nTODSVars - 1)) {
  for (j in (i + 1):nTODSVars) {
    # NOTE(review): rfAcc is used directly here (no [["r2"]] subscript as in
    # the continuous case) — presumably a scalar accuracy when
    # isContVar=FALSE; confirm against runFullRF's return value.
    accTODS <- runFullRF(
      dfTrain = dfTrain_v3[idxSmallTODS, ],
      yVar = "todSeason",
      xVars = possTODSVars[c(i, j)],
      dfTest = dfTest_v3,
      useLabel = keyLabel,
      useSub = stringr::str_to_sentence(keyLabel),
      isContVar = FALSE,
      makePlots = FALSE,
      returnData = TRUE
    )[["rfAcc"]]
    mtxSmallTODS <- rbind(mtxSmallTODS, c(i, j, accTODS))
  }
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.764%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.232%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.437%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.165%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.856%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.325%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.138%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.494%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.053%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.092%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.455%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.437%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.784%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.044%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.314%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.233%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.549%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.024%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.089%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.37%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.333%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.408%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.027%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.289%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.32%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.46%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.818%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.362%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.23%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.821%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.525%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.858%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.027%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.542%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.729%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.355%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.373%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.57%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.635%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.656%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.477%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.547%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.77%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.004%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.902%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.485%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.859%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.316%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.55%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.298%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.25%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.214%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.85%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.411%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.596%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.26%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.921%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.256%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.264%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.624%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.401%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.349%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.256%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.173%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.518%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.957%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.856%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.697%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.406%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.378%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.912%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.416%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.527%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.584%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.853%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.638%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.245%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.432%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.676%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.153%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.419%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.878%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.211%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.39%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.281%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.669%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.687%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.835%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.894%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.967%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.687%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.764%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.905%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.905%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.715%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.941%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.988%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.707%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.709%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.595%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.865%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.317%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.585%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.622%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.201%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.809%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.539%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.174%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.336%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.252%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.281%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.86%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.629%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.463%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.939%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.988%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.486%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.394%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.383%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.215%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.887%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.794%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.898%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.009%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.485%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.754%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.352%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.193%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.988%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.186%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.578%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.119%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.129%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.438%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.193%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.713%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.232%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.081%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.949%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.258%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.264%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.983%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.31%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.575%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.635%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.028%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.467%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.233%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.898%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.168%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.093%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.894%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.323%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.697%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.081%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.952%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.326%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.558%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.867%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.373%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.837%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.134%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.671%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.614%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.715%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.24%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.89%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.095%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.451%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.313%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.025%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.838%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.111%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.989%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.963%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.176%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.723%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.012%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.432%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.5%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.532%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.168%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.511%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.633%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.625%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.388%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.817%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.364%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.352%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.463%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.323%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.975%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.134%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.001%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.848%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.978%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.812%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.176%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.682%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.316%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.378%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.245%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.965%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.799%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.204%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.261%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.95%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.259%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.952%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.201%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.863%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.803%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.754%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.37%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.183%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.489%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.414%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.3%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.175%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.139%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.643%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.776%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.363%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.489%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.749%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.638%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.313%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.625%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.495%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.576%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.558%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.906%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.367%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.695%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.065%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.24%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.5%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.697%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.489%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.806%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.31%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.157%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.672%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.139%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.373%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.352%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.173%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.723%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.417%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.017%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.822%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.381%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.708%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.139%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.978%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.471%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.092%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.772%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.931%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.035%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.274%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.554%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.292%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.435%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.43%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.737%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.173%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.274%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.713%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.154%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.064%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.906%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.007%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.054%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.251%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.435%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.622%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.962%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.063%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.978%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.474%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.5%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.069%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.947%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.848%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.934%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.045%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.427%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.791%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.035%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.402%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.158%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.186%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.29%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.923%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.162%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.541%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.016%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.287%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.689%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.097%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.741%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.731%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.032%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.658%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.648%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.396%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.827%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.503%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.139%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.054%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.568%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.578%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.406%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.162%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.138%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.894%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.715%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.201%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.806%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.386%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.871%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.622%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.032%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.51%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.5%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.471%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.78%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.695%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.993%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.593%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.429%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.712%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.496%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.945%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.459%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.693%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.595%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.891%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.359%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.363%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.921%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.593%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.227%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.424%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.983%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.796%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.695%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.848%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.346%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.662%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.485%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.223%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.065%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.621%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.694%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.338%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.958%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.528%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.947%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.799%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.451%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.544%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.396%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.697%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.065%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.605%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.363%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.802%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.054%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.088%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.806%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.813%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.584%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.126%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.388%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.274%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.928%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.36%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.203%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.338%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.981%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.758%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.384%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.062%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.359%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.265%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.906%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.31%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.791%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.165%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.082%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.388%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.993%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.512%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.258%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.64%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.931%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.684%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.375%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.786%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.069%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.54%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.791%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.851%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.687%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.508%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.671%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.053%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.557%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.463%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.638%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.487%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.918%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.928%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.903%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.072%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.573%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.908%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.1%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.398%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.032%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.957%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.983%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.562%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.552%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.097%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.417%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.451%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.768%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.819%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.37%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.601%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.882%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.347%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.142%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.692%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.63%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.097%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.591%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.11%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.707%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.64%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.417%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.17%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.357%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.89%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.5%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.928%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.202%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.839%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.839%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.353%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.938%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.021%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.338%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.426%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.13%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.6%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.029%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.031%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.866%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.437%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.128%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.055%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.434%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.592%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.997%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.85%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.362%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.645%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.104%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.733%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.492%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.126%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.315%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.393%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.226%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.183%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.321%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.996%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.897%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.954%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.824%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.728%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.754%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.256%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.069%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.128%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.741%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.923%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.726%
Predictive success by metric is explored:
# Turn the (idx1, idx2, accuracy) matrix into a tibble with readable
# variable names looked up from possTODSVars, plus a row number for joins.
dfSmallR2TODS <- mtxSmallTODS %>%
  as.data.frame() %>%
  setNames(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(var1=possTODSVars[idx1],
         var2=possTODSVars[idx2],
         rn=row_number())
# Display the 20 best-scoring 2-variable combinations
dfSmallR2TODS %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n=20)
## # A tibble: 496 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_diffuse_radiation pct_soil_temperature_100_to_255cm 0.629
## 2 pct_shortwave_radiation pct_soil_temperature_100_to_255cm 0.627
## 3 pct_shortwave_radiation pct_soil_temperature_28_to_100cm 0.625
## 4 pct_shortwave_radiation pct_soil_temperature_7_to_28cm 0.619
## 5 pct_diffuse_radiation pct_soil_temperature_7_to_28cm 0.618
## 6 pct_diffuse_radiation pct_soil_temperature_28_to_100cm 0.614
## 7 pct_shortwave_radiation pct_soil_temperature_0_to_7cm 0.605
## 8 pct_diffuse_radiation pct_soil_temperature_0_to_7cm 0.600
## 9 pct_temperature_2m pct_diffuse_radiation 0.592
## 10 pct_diffuse_radiation pct_soil_moisture_100_to_255cm 0.589
## 11 pct_direct_radiation pct_soil_temperature_7_to_28cm 0.585
## 12 pct_direct_radiation pct_soil_temperature_100_to_255cm 0.581
## 13 pct_temperature_2m pct_shortwave_radiation 0.578
## 14 pct_shortwave_radiation pct_soil_moisture_100_to_255cm 0.574
## 15 pct_direct_normal_irradiance pct_soil_temperature_7_to_28cm 0.574
## 16 pct_apparent_temperature pct_diffuse_radiation 0.573
## 17 pct_direct_radiation pct_soil_temperature_28_to_100cm 0.572
## 18 pct_direct_normal_irradiance pct_soil_temperature_28_to_100cm 0.568
## 19 pct_apparent_temperature pct_shortwave_radiation 0.567
## 20 pct_direct_radiation pct_soil_temperature_0_to_7cm 0.567
## # ℹ 476 more rows
# Summarise each variable's R-squared range (min / mean / max) across every
# 2-predictor model it appears in, then plot the range per variable,
# ordered by mean R-squared.
dfSmallR2TODS %>%
  pivot_longer(cols=c(var1, var2)) %>%
  group_by(value) %>%
  summarize(r2_min=min(r2), r2_mu=mean(r2), r2_max=max(r2)) %>%
  ggplot(aes(x=fct_reorder(value, r2_mu))) +
  geom_point(aes(y=r2_mu)) +
  geom_errorbar(aes(ymin=r2_min, ymax=r2_max)) +
  # Reference line at the theoretical maximum R-squared of 1
  geom_hline(yintercept=1, lty=2, color="red") +
  coord_flip() +
  lims(y=c(NA, 1)) +
  labs(title="R-squared in every 2-predictor model including self and one other",
       subtitle="Predicting todSeason",
       y="Range of R2 (min-mean-max)",
       x=NULL
  )
The variables with the best maximum R-squared are radiation (day/night) and soil temperature/moisture (season). This is consistent with the previous iterative approach, as there is no standalone 2-variable formula for perfectly predicting both time of day and season.
The best combinations are explored using the full training dataset:
# Take the 5 best-scoring variable pairs and collect the distinct variables
# they involve, preserving first-appearance order (var1 then var2 per row).
dfTop5TODS <- dfSmallR2TODS %>%
  arrange(desc(r2)) %>%
  slice_head(n=5)
# rbind() of the two name vectors gives a 2x5 matrix; reading it column-wise
# interleaves var1/var2 per pair, matching the original pivot_longer order.
possLargeVarsTODS <- unique(as.vector(rbind(dfTop5TODS$var1, dfTop5TODS$var2)))
possLargeVarsTODS
## [1] "pct_diffuse_radiation" "pct_soil_temperature_100_to_255cm"
## [3] "pct_shortwave_radiation" "pct_soil_temperature_28_to_100cm"
## [5] "pct_soil_temperature_7_to_28cm"
# Evaluate every 2-variable combination of the shortlisted predictors on the
# FULL training dataset, recording the 2022-holdout accuracy for each pair.
# The result matrix is preallocated to choose(n, 2) rows instead of being
# grown with rbind() inside the loop, which copies the matrix every pass.
nLargeTODS <- length(possLargeVarsTODS)
mtxLargeTODS <- matrix(NA_real_, nrow=choose(nLargeTODS, 2), ncol=3)
rowLargeTODS <- 0L
for(idx1 in seq_len(nLargeTODS - 1)) {
  for(idx2 in (idx1+1):nLargeTODS) {
    # runFullRF() fits a random forest and returns holdout accuracy ("rfAcc")
    # NOTE(review): downstream code names this column "r2", but the value is
    # a classification accuracy, not an R-squared
    r2LargeTODS <- runFullRF(dfTrain=dfTrain_v3[,],
                             yVar="todSeason",
                             xVars=possLargeVarsTODS[c(idx1, idx2)],
                             dfTest=dfTest_v3,
                             useLabel=keyLabel,
                             useSub=stringr::str_to_sentence(keyLabel),
                             isContVar=FALSE,
                             makePlots=FALSE,
                             returnData=TRUE
    )[["rfAcc"]]
    rowLargeTODS <- rowLargeTODS + 1L
    mtxLargeTODS[rowLargeTODS, ] <- c(idx1, idx2, r2LargeTODS)
  }
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.427%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.459%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.377%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.013%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.459%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.554%
# Convert the full-training-set pair results to a tibble with readable
# variable names, mirroring the construction of dfSmallR2TODS.
dfLargeR2TODS <- mtxLargeTODS %>%
  as.data.frame() %>%
  setNames(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(var1=possLargeVarsTODS[idx1],
         var2=possLargeVarsTODS[idx2],
         rn=row_number())
# Display all pairs, best first (only 10 combinations exist)
dfLargeR2TODS %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n=20)
## # A tibble: 10 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_diffuse_radiation pct_soil_temperature_100_to_255cm 0.650
## 2 pct_diffuse_radiation pct_soil_temperature_7_to_28cm 0.646
## 3 pct_shortwave_radiation pct_soil_temperature_7_to_28cm 0.631
## 4 pct_diffuse_radiation pct_soil_temperature_28_to_100cm 0.631
## 5 pct_soil_temperature_100_to_255cm pct_shortwave_radiation 0.625
## 6 pct_shortwave_radiation pct_soil_temperature_28_to_100cm 0.625
## 7 pct_soil_temperature_100_to_255cm pct_soil_temperature_7_to_28cm 0.480
## 8 pct_soil_temperature_100_to_255cm pct_soil_temperature_28_to_100cm 0.464
## 9 pct_soil_temperature_28_to_100cm pct_soil_temperature_7_to_28cm 0.446
## 10 pct_diffuse_radiation pct_shortwave_radiation 0.334
The process is repeated for categorical variable fct_hour, which converts hour to a factor:
# Repeat the pairwise screen for the categorical target fct_hour, using a
# fixed-seed 5000-row subsample of the training data for speed.
possHourVars <- c(varsTrain, "month", "doy")
set.seed(24032415)
idxSmallHour <- sample(seq_len(nrow(dfTrain_v3)), 5000, replace=FALSE)
# Preallocate the (idx1, idx2, accuracy) matrix to choose(n, 2) rows rather
# than growing it with rbind() inside the loop (quadratic copying).
nHourVars <- length(possHourVars)
mtxSmallHour <- matrix(NA_real_, nrow=choose(nHourVars, 2), ncol=3)
rowSmallHour <- 0L
for(idx1 in seq_len(nHourVars - 1)) {
  for(idx2 in (idx1+1):nHourVars) {
    # runFullRF() returns the 2022-holdout accuracy as "rfAcc"
    r2SmallHour <- runFullRF(dfTrain=dfTrain_v3[idxSmallHour,],
                             yVar="fct_hour",
                             xVars=possHourVars[c(idx1, idx2)],
                             dfTest=dfTest_v3,
                             useLabel=keyLabel,
                             useSub=stringr::str_to_sentence(keyLabel),
                             isContVar=FALSE,
                             makePlots=FALSE,
                             returnData=TRUE
    )[["rfAcc"]]
    rowSmallHour <- rowSmallHour + 1L
    mtxSmallHour[rowSmallHour, ] <- c(idx1, idx2, r2SmallHour)
  }
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.498%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.685%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.891%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.33%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.007%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.921%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.192%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.259%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.482%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.826%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.75%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.657%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.33%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.937%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.265%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.311%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.265%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.106%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.293%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.293%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.919%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.324%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.033%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.267%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.417%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.124%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.124%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.799%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.059%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.592%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.031%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.732%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.826%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.592%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.358%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.153%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.293%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.693%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.021%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.669%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.631%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.781%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.192%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.311%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.021%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.143%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.165%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.734%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.763%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.553%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.781%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.218%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.61%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.143%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.002%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.002%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.192%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.771%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.389%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.379%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.88%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.33%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.389%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.724%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.921%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.61%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.88%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.506%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.828%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.631%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.724%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.688%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.909%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.002%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.657%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.732%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.423%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.693%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.641%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.005%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.22%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.33%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.285%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.143%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.693%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.657%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.797%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.594%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.145%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.005%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.08%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.273%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.08%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.405%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.6%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.6%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.74%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.259%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.454%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.753%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.015%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.864%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.405%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.021%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.909%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.657%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.745%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.267%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.08%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.202%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.657%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.61%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.033%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.059%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.423%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.95%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.22%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.893%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.36%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.314%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.639%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.36%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.724%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.332%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.163%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.405%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.47%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.553%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.277%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.334%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.511%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.137%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.342%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.612%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.36%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.436%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.924%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.475%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.428%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.007%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.727%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.194%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.184%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.669%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.529%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.015%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.524%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.318%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.558%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.119%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.818%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.677%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.088%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.602%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.379%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.295%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.903%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.781%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.688%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.267%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.875%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.005%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.94%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.098%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.238%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.353%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.895%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.025%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.742%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.818%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.994%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.836%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.257%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.212%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.454%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.846%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.257%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.379%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.303%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.742%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.116%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.631%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.968%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.295%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.781%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.846%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.472%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.005%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.986%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.68%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.604%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.727%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.791%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.914%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.95%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.669%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.436%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.202%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.856%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.56%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.7%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.088%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.472%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.031%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.189%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.452%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.75%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.143%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.143%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.257%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.956%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.787%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.584%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.376%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.704%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.6%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.893%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.639%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.797%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.441%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.722%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.363%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.407%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.36%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.893%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.623%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.49%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.116%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.098%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.745%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.353%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.714%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.124%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.527%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.919%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.2%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.078%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.106%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.976%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.517%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.405%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.124%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.75%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.096%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.049%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.013%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.067%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.086%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.254%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.582%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.88%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.021%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.208%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.6%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.395%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.348%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.488%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.909%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.834%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.927%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.301%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.647%
Predictive success by metric is explored:
# Convert the pairwise small-model results matrix into a tibble and attach
# the human-readable predictor names for each index pair
dfSmallR2Hour <- mtxSmallHour %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possHourVars[idx1],
    var2 = possHourVars[idx2],
    rn = row_number()
  )
# Show the 20 best 2-predictor combinations, ranked by accuracy
dfSmallR2Hour %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 561 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_shortwave_radiation doy 0.213
## 2 pct_shortwave_radiation month 0.205
## 3 pct_diffuse_radiation doy 0.187
## 4 pct_diffuse_radiation month 0.186
## 5 pct_shortwave_radiation pct_et0_fao_evapotranspiration 0.169
## 6 pct_et0_fao_evapotranspiration pct_vapor_pressure_deficit 0.164
## 7 pct_direct_radiation doy 0.159
## 8 pct_shortwave_radiation pct_vapor_pressure_deficit 0.155
## 9 pct_shortwave_radiation pct_soil_temperature_0_to_7cm 0.154
## 10 pct_shortwave_radiation pct_direct_normal_irradiance 0.153
## 11 pct_shortwave_radiation pct_soil_temperature_100_to_255cm 0.152
## 12 pct_temperature_2m pct_shortwave_radiation 0.150
## 13 pct_shortwave_radiation pct_soil_temperature_7_to_28cm 0.150
## 14 pct_diffuse_radiation pct_soil_temperature_28_to_100cm 0.149
## 15 pct_shortwave_radiation pct_soil_temperature_28_to_100cm 0.147
## 16 pct_diffuse_radiation pct_soil_temperature_0_to_7cm 0.147
## 17 pct_diffuse_radiation pct_et0_fao_evapotranspiration 0.147
## 18 pct_direct_radiation month 0.144
## 19 pct_et0_fao_evapotranspiration doy 0.144
## 20 pct_temperature_2m pct_diffuse_radiation 0.143
## # ℹ 541 more rows
# For every predictor, summarize the min/mean/max R-squared it achieved
# across all of its pairings, then plot the range ordered by the mean
dfSmallR2Hour %>%
  pivot_longer(cols = c(var1, var2)) %>%
  group_by(value) %>%
  summarize(r2_min = min(r2), r2_mu = mean(r2), r2_max = max(r2)) %>%
  ggplot(aes(x = fct_reorder(value, r2_mu))) +
  coord_flip() +
  geom_point(aes(y = r2_mu)) +
  geom_errorbar(aes(ymin = r2_min, ymax = r2_max)) +
  lims(y = c(NA, 1)) +
  geom_hline(yintercept = 1, lty = 2, color = "red") +
  labs(
    title = "R-squared in every 2-predictor model including self and one other",
    subtitle = "Predicting fct_hour",
    y = "Range of R2 (min-mean-max)",
    x = NULL
  )
The best combinations are explored using the full training dataset:
# Keep the distinct predictors appearing in the top six pairs; unique() on
# the interleaved var1/var2 values preserves first-appearance order
possLargeVarsHour <- dfSmallR2Hour %>%
  arrange(desc(r2)) %>%
  slice_head(n = 6) %>%
  select(r2, rn, var1, var2) %>%
  pivot_longer(cols = c(var1, var2)) %>%
  pull(value) %>%
  unique()
possLargeVarsHour
## [1] "pct_shortwave_radiation" "doy"
## [3] "month" "pct_diffuse_radiation"
## [5] "pct_et0_fao_evapotranspiration" "pct_vapor_pressure_deficit"
# Evaluate every 2-predictor combination of the finalist variables on the
# full training data, recording holdout accuracy for each pair.
# The results matrix is preallocated (one row per pair) instead of being
# grown with rbind() inside the loop, which copies the whole matrix on
# every iteration (accidental O(n^2) behavior).
nVarsLargeHour <- length(possLargeVarsHour)
mtxLargeHour <- matrix(NA_real_, nrow=choose(nVarsLargeHour, 2), ncol=3)
rowLargeHour <- 0L
for(idx1 in seq_len(nVarsLargeHour - 1)) {
  for(idx2 in (idx1 + 1):nVarsLargeHour) {
    # Random-forest accuracy for this predictor pair on the 2022 holdout
    # (dfTrain_v3 is passed directly; the previous dfTrain_v3[,] made a
    # needless full copy)
    r2LargeHour <- runFullRF(dfTrain=dfTrain_v3,
                             yVar="fct_hour",
                             xVars=possLargeVarsHour[c(idx1, idx2)],
                             dfTest=dfTest_v3,
                             useLabel=keyLabel,
                             useSub=stringr::str_to_sentence(keyLabel),
                             isContVar=FALSE,
                             makePlots=FALSE,
                             returnData=TRUE
                             )[["rfAcc"]]
    rowLargeHour <- rowLargeHour + 1L
    mtxLargeHour[rowLargeHour, ] <- c(idx1, idx2, r2LargeHour)
  }
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.666%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.469%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.298%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.391%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.906%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.974%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.001%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.111%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.948%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.365%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.615%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.322%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.269%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.334%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.017%
# Convert the full-training-set results matrix into a tibble and attach
# the predictor names for each index pair
dfLargeR2Hour <- mtxLargeHour %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possLargeVarsHour[idx1],
    var2 = possLargeVarsHour[idx2],
    rn = row_number()
  )
# Show all pairs ranked by accuracy
dfLargeR2Hour %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 15 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pct_shortwave_radiation doy 0.257
## 2 pct_shortwave_radiation month 0.235
## 3 doy pct_diffuse_radiation 0.230
## 4 month pct_diffuse_radiation 0.214
## 5 pct_shortwave_radiation pct_vapor_pressure_deficit 0.179
## 6 pct_shortwave_radiation pct_et0_fao_evapotranspiration 0.174
## 7 pct_shortwave_radiation pct_diffuse_radiation 0.173
## 8 doy pct_et0_fao_evapotranspiration 0.171
## 9 pct_et0_fao_evapotranspiration pct_vapor_pressure_deficit 0.170
## 10 pct_diffuse_radiation pct_et0_fao_evapotranspiration 0.163
## 11 month pct_et0_fao_evapotranspiration 0.156
## 12 pct_diffuse_radiation pct_vapor_pressure_deficit 0.153
## 13 month pct_vapor_pressure_deficit 0.0832
## 14 doy pct_vapor_pressure_deficit 0.0795
## 15 doy month 0.0397
Relationships among month, hour, and radiation are explored:
# Boxplots of hourly shortwave radiation, one panel per month
dfTrain_v3 %>%
  select(month, fct_hour, swrad = shortwave_radiation) %>%
  ggplot(aes(x = fct_hour, y = swrad)) +
  geom_boxplot(fill = "lightblue") +
  facet_wrap(~month) +
  labs(title = "Shortwave radiation by month and hour",
       x = "Hour",
       y = "Shortwave Radiation")
# Histogram of nonzero shortwave radiation by month; the (large) count of
# zero observations is annotated as text in each panel rather than plotted,
# since the zeros would otherwise dominate the histogram.
# The rndS rounding column from the original version was never used by this
# plot and has been removed.
dfTrain_v3 %>%
select(month, fct_hour, s=shortwave_radiation) %>%
ggplot(aes(x=s)) +
geom_histogram(data=~filter(., s>0), fill="lightblue") +
geom_text(data=~summarize(group_by(., month), s0=sum(s==0)),
aes(x=500, y=500, label=paste0(s0, " obs of 0\nnot plotted")),
size=2.5,
hjust=0
) +
facet_wrap(~month) +
labs(title="Shortwave Radiation Histogram", x="Shortwave Radiation", y=NULL)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# For each rounded radiation bucket (nearest 40), show the proportional mix
# of hours it occurs at, faceted by month
dfTrain_v3 %>%
  select(month, fct_hour, s = shortwave_radiation) %>%
  mutate(rndS = round(s / 40) * 40) %>%
  ggplot(aes(x = factor(rndS), fill = fct_hour)) +
  geom_bar(position = "fill") +
  facet_wrap(~month) +
  labs(title = "Shortwave Radiation by Month and Hour",
       x = "Shortwave Radiation",
       y = NULL)
With no solar radiation at night, solar radiation has no power to predict the specific hour of night. As well, because solar radiation generally follows a sinusoidal trend from dawn to dusk, it has little power to distinguish hours before solar noon from the mirror-image hours after solar noon. Solar radiation is chiefly useful for distinguishing night from day, and for estimating the rough distance in time from solar noon (during daylight hours and given the month).
Hour is converted to blocks of time for easier interpretation:
# Bucket each hour of day into four interpretable time blocks:
# 8pm-8am (hours 20-23 and 0-7), 8am-noon (8-11), noon-4pm (12-15), 4pm-8pm (16-19)
tmpHourBucket <- dfTrain_v3 %>%
  select(month, fct_hour, s = shortwave_radiation) %>%
  mutate(
    num_hour = as.numeric(as.character(fct_hour)),
    bkt_hour = case_when(
      num_hour >= 8 & num_hour < 12 ~ "8am-noon",
      num_hour >= 12 & num_hour < 16 ~ "noon-4pm",
      num_hour >= 16 & num_hour < 20 ~ "4pm-8pm",
      TRUE ~ "8pm-8am"
    ),
    bkt_hour = factor(bkt_hour, levels = c("8pm-8am", "8am-noon", "noon-4pm", "4pm-8pm"))
  )
# Confirm the hour-to-bucket mapping looks right
tmpHourBucket %>% count(bkt_hour, fct_hour) %>% print(n = 30)
## # A tibble: 24 × 3
## bkt_hour fct_hour n
## <fct> <fct> <int>
## 1 8pm-8am 0 3320
## 2 8pm-8am 1 3292
## 3 8pm-8am 2 3295
## 4 8pm-8am 3 3289
## 5 8pm-8am 4 3287
## 6 8pm-8am 5 3282
## 7 8pm-8am 6 3301
## 8 8pm-8am 7 3302
## 9 8pm-8am 20 3317
## 10 8pm-8am 21 3271
## 11 8pm-8am 22 3293
## 12 8pm-8am 23 3262
## 13 8am-noon 8 3245
## 14 8am-noon 9 3310
## 15 8am-noon 10 3240
## 16 8am-noon 11 3263
## 17 noon-4pm 12 3313
## 18 noon-4pm 13 3301
## 19 noon-4pm 14 3273
## 20 noon-4pm 15 3271
## 21 4pm-8pm 16 3258
## 22 4pm-8pm 17 3298
## 23 4pm-8pm 18 3269
## 24 4pm-8pm 19 3301
# Boxplots of shortwave radiation by bucketed hour, one panel per month
tmpHourBucket %>%
  ggplot(aes(x = bkt_hour, y = s)) +
  geom_boxplot(fill = "lightblue") +
  facet_wrap(~month) +
  labs(title = "Shortwave radiation by month and hour",
       x = "Hour (bucketed)",
       y = "Shortwave Radiation")
# Proportional mix of hour buckets within each rounded radiation level
tmpHourBucket %>%
  mutate(rndS = round(s / 40) * 40) %>%
  ggplot(aes(x = factor(rndS), fill = bkt_hour)) +
  geom_bar(position = "fill") +
  facet_wrap(~month) +
  labs(title = "Shortwave Radiation by Month and Hour",
       x = "Shortwave Radiation",
       y = NULL)
Daily and hourly data are downloaded for Chicago, cached to avoid multiple hits to the server:
# Hourly data download for Chicago, IL
# Build the Open-Meteo archive URL requesting hourly metrics for Chicago
# over 2010-2023, localized to the America/Chicago timezone
testURLHourly <- helperOpenMeteoURL(cityName="Chicago IL",
hourlyIndices=1:nrow(tblMetricsHourly), # request every available hourly metric
startDate="2010-01-01",
endDate="2023-12-31",
tz="America/Chicago"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
testURLHourly # echo the constructed hourly-archive URL
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-12-31&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FChicago"
# Download file
# Only hit the Open-Meteo server if the hourly JSON is not cached locally
if(file.exists("testOM_hourly_chi.json")) {
  cat("\nFile testOM_hourly_chi.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_hourly_chi.json", url=testURLHourly)
}
##
## File testOM_hourly_chi.json already exists, skipping download
# Daily data download for Chicago, IL
# (the comment previously said "New York, NY" but the call requests Chicago)
# Build the Open-Meteo archive URL requesting daily metrics for Chicago
# over 2010-2023, localized to the America/Chicago timezone
testURLDaily <- helperOpenMeteoURL(cityName="Chicago IL",
dailyIndices=1:nrow(tblMetricsDaily), # request every available daily metric
startDate="2010-01-01",
endDate="2023-12-31",
tz="America/Chicago"
)
##
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
testURLDaily # echo the constructed daily-archive URL
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=41.84&longitude=-87.68&start_date=2010-01-01&end_date=2023-12-31&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FChicago"
# Download file
# Only hit the Open-Meteo server if the daily JSON is not cached locally
if(file.exists("testOM_daily_chi.json")) {
  cat("\nFile testOM_daily_chi.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_daily_chi.json", url=testURLDaily)
}
##
## File testOM_daily_chi.json already exists, skipping download
Core datasets for the new city are loaded, with explanatory variables added for future processing:
# Read daily JSON file
# Parse the cached daily JSON into a list: tblDaily (data), tblHourly (NULL
# for a daily file), tblUnits, and tblDescription (location metadata)
chiOMDaily <- readOpenMeteoJSON("testOM_daily_chi.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
chiOMDaily # print the parsed list (tblDaily, tblHourly=NULL, tblUnits, tblDescription)
## $tblDaily
## # A tibble: 5,113 × 18
## date time weathercode temperature_2m_max temperature_2m_min
## <date> <chr> <int> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 3 -8.6 -13.4
## 2 2010-01-02 2010-01-02 2 -10.4 -15.1
## 3 2010-01-03 2010-01-03 3 -7.9 -13.8
## 4 2010-01-04 2010-01-04 3 -6.9 -12.3
## 5 2010-01-05 2010-01-05 3 -4.8 -9.8
## 6 2010-01-06 2010-01-06 71 -4.9 -9
## 7 2010-01-07 2010-01-07 73 -5.2 -8.5
## 8 2010-01-08 2010-01-08 73 -3 -9.4
## 9 2010-01-09 2010-01-09 3 -5.8 -12.3
## 10 2010-01-10 2010-01-10 3 -8.8 -19.4
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## # apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## # snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 41.9 -87.6 59.4 -18000 America/Chicago
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(chiOMDaily) # formatted print of the location/timezone metadata
##
## latitude: 41.86292
## longitude: -87.64877
## generationtime_ms: 59.38601
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Read hourly JSON file
# Parse the cached hourly JSON into a list: tblDaily (NULL for an hourly
# file), tblHourly (data), tblUnits, and tblDescription (location metadata)
chiOMHourly <- readOpenMeteoJSON("testOM_hourly_chi.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
chiOMHourly # print the parsed list (tblDaily=NULL, tblHourly, tblUnits, tblDescription)
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 122,712 × 37
## time date hour temperature_2m relativehumidity_2m
## <dttm> <date> <int> <dbl> <int>
## 1 2010-01-01 00:00:00 2010-01-01 0 -9.5 67
## 2 2010-01-01 01:00:00 2010-01-01 1 -9.8 69
## 3 2010-01-01 02:00:00 2010-01-01 2 -10.3 73
## 4 2010-01-01 03:00:00 2010-01-01 3 -10.8 74
## 5 2010-01-01 04:00:00 2010-01-01 4 -11.3 75
## 6 2010-01-01 05:00:00 2010-01-01 5 -11.8 76
## 7 2010-01-01 06:00:00 2010-01-01 6 -12.3 77
## 8 2010-01-01 07:00:00 2010-01-01 7 -12.8 78
## 9 2010-01-01 08:00:00 2010-01-01 8 -13.2 79
## 10 2010-01-01 09:00:00 2010-01-01 9 -13.4 78
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # ℹ 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 41.9 -87.6 4476. -18000 America/Chicago
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(chiOMHourly) # formatted print of the location/timezone metadata
##
## latitude: 41.86292
## longitude: -87.64877
## generationtime_ms: 4476.2
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Create percentiles for numeric variables
# Add calendar fields (year, month, hour, day-of-year, season, time-of-day)
# to the hourly Chicago data, then append a pct_* percentile-rank (0-100)
# companion column for every numeric column.
# NOTE: the mutate step order matters — tod/season/todSeason are built as
# character first, then converted to factors after todSeason is composed.
chiTemp <- chiOMHourly$tblHourly %>%
mutate(year=year(date),
month=factor(month.abb[lubridate::month(date)], levels=month.abb), # calendar-ordered month factor
hour=lubridate::hour(time),
fct_hour=factor(hour),
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"), # "Day" defined as hours 7-18 inclusive
doy=yday(date),
season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Fall",
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
TRUE~"typo" # sentinel: should never appear; flags a mapping error
),
todSeason=paste0(season, "-", tod), # composed before tod/season become factors
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
),
# 0-100 percentile rank for every numeric column, named pct_<original>
across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
)
glimpse(chiTemp) # verify the new columns (80 total) were created as expected
## Rows: 122,712
## Columns: 80
## $ time <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m <dbl> -9.5, -9.8, -10.3, -10.8, -11.3, -11…
## $ relativehumidity_2m <int> 67, 69, 73, 74, 75, 76, 77, 78, 79, …
## $ dewpoint_2m <dbl> -14.4, -14.4, -14.2, -14.5, -14.8, -…
## $ apparent_temperature <dbl> -15.8, -16.3, -16.8, -17.2, -17.7, -…
## $ pressure_msl <dbl> 1024.4, 1024.7, 1025.3, 1025.8, 1026…
## $ surface_pressure <dbl> 1000.8, 1001.1, 1001.6, 1002.1, 1002…
## $ precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover <int> 62, 47, 20, 15, 15, 19, 25, 22, 22, …
## $ cloudcover_low <int> 69, 52, 22, 17, 17, 21, 28, 25, 25, …
## $ cloudcover_mid <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, …
## $ cloudcover_high <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 119, …
## $ direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 69, 14…
## $ direct_normal_irradiance <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 50, 7…
## $ windspeed_10m <dbl> 18.7, 20.1, 19.9, 19.5, 19.0, 19.4, …
## $ windspeed_100m <dbl> 25.9, 28.4, 29.2, 29.8, 30.1, 30.0, …
## $ winddirection_10m <int> 298, 291, 290, 289, 289, 288, 287, 2…
## $ winddirection_100m <int> 299, 294, 294, 295, 295, 294, 295, 2…
## $ windgusts_10m <dbl> 33.8, 32.4, 34.2, 33.1, 31.3, 31.7, …
## $ et0_fao_evapotranspiration <dbl> 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, …
## $ weathercode <int> 2, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ vapor_pressure_deficit <dbl> 0.10, 0.09, 0.08, 0.07, 0.06, 0.06, …
## $ soil_temperature_0_to_7cm <dbl> -1.5, -1.6, -1.8, -1.9, -2.1, -2.3, …
## $ soil_temperature_7_to_28cm <dbl> -0.4, -0.4, -0.4, -0.4, -0.4, -0.4, …
## $ soil_temperature_28_to_100cm <dbl> 2.4, 2.4, 2.4, 2.4, 2.3, 2.3, 2.3, 2…
## $ soil_temperature_100_to_255cm <dbl> 9.0, 9.0, 9.0, 9.0, 8.9, 8.9, 8.9, 8…
## $ soil_moisture_0_to_7cm <dbl> 0.295, 0.295, 0.294, 0.294, 0.294, 0…
## $ soil_moisture_7_to_28cm <dbl> 0.300, 0.300, 0.300, 0.300, 0.300, 0…
## $ soil_moisture_28_to_100cm <dbl> 0.334, 0.334, 0.334, 0.334, 0.334, 0…
## $ soil_moisture_100_to_255cm <dbl> 0.310, 0.310, 0.310, 0.310, 0.311, 0…
## $ origTime <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod <fct> Night, Night, Night, Night, Night, N…
## $ doy <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m <dbl> 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, …
## $ pct_relativehumidity_2m <dbl> 33, 37, 46, 48, 50, 52, 55, 57, 59, …
## $ pct_dewpoint_2m <dbl> 4, 4, 5, 4, 4, 4, 4, 4, 3, 3, 3, 4, …
## $ pct_apparent_temperature <dbl> 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, …
## $ pct_pressure_msl <dbl> 84, 85, 86, 88, 89, 89, 90, 91, 91, …
## $ pct_surface_pressure <dbl> 80, 81, 83, 85, 85, 86, 87, 89, 89, …
## $ pct_precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover <dbl> 62, 55, 33, 30, 30, 33, 37, 35, 35, …
## $ pct_cloudcover_low <dbl> 77, 74, 66, 64, 64, 66, 68, 67, 67, …
## $ pct_cloudcover_mid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0,…
## $ pct_cloudcover_high <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 63, 7…
## $ pct_direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 69, 7…
## $ pct_direct_normal_irradiance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 76, 8…
## $ pct_diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 59, 6…
## $ pct_windspeed_10m <dbl> 66, 72, 71, 70, 68, 69, 65, 63, 59, …
## $ pct_windspeed_100m <dbl> 59, 67, 69, 71, 72, 72, 67, 63, 61, …
## $ pct_winddirection_10m <dbl> 87, 85, 84, 84, 84, 84, 83, 83, 83, …
## $ pct_winddirection_100m <dbl> 86, 85, 85, 85, 85, 85, 85, 85, 84, …
## $ pct_windgusts_10m <dbl> 69, 65, 70, 67, 62, 63, 63, 61, 59, …
## $ pct_et0_fao_evapotranspiration <dbl> 27, 16, 16, 16, 16, 16, 16, 16, 16, …
## $ pct_weathercode <dbl> 55, 34, 0, 0, 0, 0, 34, 34, 34, 0, 3…
## $ pct_vapor_pressure_deficit <dbl> 17, 15, 12, 10, 7, 7, 5, 5, 5, 5, 5,…
## $ pct_soil_temperature_0_to_7cm <dbl> 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 2, 2, …
## $ pct_soil_temperature_7_to_28cm <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ pct_soil_temperature_28_to_100cm <dbl> 18, 18, 18, 18, 18, 18, 18, 18, 18, …
## $ pct_soil_temperature_100_to_255cm <dbl> 40, 40, 40, 40, 40, 40, 40, 40, 40, …
## $ pct_soil_moisture_0_to_7cm <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, …
## $ pct_soil_moisture_7_to_28cm <dbl> 84, 84, 84, 84, 84, 84, 84, 84, 84, …
## $ pct_soil_moisture_28_to_100cm <dbl> 99, 99, 99, 99, 99, 99, 99, 98, 98, …
## $ pct_soil_moisture_100_to_255cm <dbl> 85, 85, 85, 85, 86, 86, 86, 86, 86, …
## $ pct_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: day-of-year distribution within each month
chiTemp %>%
  count(doy, month) %>%
  ggplot(aes(x = month, y = doy)) +
  geom_boxplot(aes(weight = n), fill = "lightblue") +
  labs(title = "Observations by day-of-year and month",
       x = NULL,
       y = "Day of Year")
# Sanity check: record counts per year-month cell as an annotated heatmap
chiTemp %>%
  count(year, month) %>%
  ggplot(aes(x = month, y = factor(year))) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), size = 3) +
  scale_fill_continuous("# Records", low = "white", high = "green") +
  labs(title = "Records by year and month", x = NULL, y = NULL)
chiTemp %>% count(todSeason, season, tod) # sanity-check the season/time-of-day crossing
## # A tibble: 8 × 4
## todSeason season tod n
## <fct> <fct> <fct> <int>
## 1 Spring-Day Spring Day 15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day Summer Day 15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day Fall Day 15288
## 6 Fall-Night Fall Night 15288
## 7 Winter-Day Winter Day 15156
## 8 Winter-Night Winter Night 15156
chiTemp %>% count(hour, fct_hour, tod) %>% print(n=30) # sanity-check hour-to-tod mapping
## # A tibble: 24 × 4
## hour fct_hour tod n
## <int> <fct> <fct> <int>
## 1 0 0 Night 5113
## 2 1 1 Night 5113
## 3 2 2 Night 5113
## 4 3 3 Night 5113
## 5 4 4 Night 5113
## 6 5 5 Night 5113
## 7 6 6 Night 5113
## 8 7 7 Day 5113
## 9 8 8 Day 5113
## 10 9 9 Day 5113
## 11 10 10 Day 5113
## 12 11 11 Day 5113
## 13 12 12 Day 5113
## 14 13 13 Day 5113
## 15 14 14 Day 5113
## 16 15 15 Day 5113
## 17 16 16 Day 5113
## 18 17 17 Day 5113
## 19 18 18 Day 5113
## 20 19 19 Night 5113
## 21 20 20 Night 5113
## 22 21 21 Night 5113
## 23 22 22 Night 5113
## 24 23 23 Night 5113
chiTemp %>% count(month, season) # sanity-check the month-to-season mapping
## # A tibble: 12 × 3
## month season n
## <fct> <fct> <int>
## 1 Jan Winter 10416
## 2 Feb Winter 9480
## 3 Mar Spring 10416
## 4 Apr Spring 10080
## 5 May Spring 10416
## 6 Jun Summer 10080
## 7 Jul Summer 10416
## 8 Aug Summer 10416
## 9 Sep Fall 10080
## 10 Oct Fall 10416
## 11 Nov Fall 10080
## 12 Dec Winter 10416
The existing model (for a different city) for predicting temperature is applied to the new data:
# Predict on new dataset
# Apply the original-city temperature random forest to the Chicago data
tstChiPred <- predictRF(rf=rfTemp2m$rf, df=chiTemp)
# Report on accuracy
# Report R-squared (and RMSE vs. a null model) of the cross-city predictions
rfChiAcc <- reportAccuracy(tstChiPred,
trueCol="temperature_2m",
rndReport=2,
useLabel="temperature predictions using previous city model on new city",
reportR2=TRUE,
returnAcc=TRUE
)
##
## R-squared of temperature predictions using previous city model on new city is: 95.8% (RMSE 2.29 vs. 11.18 null)
# Plot confusion data
# Continuous actual-vs-predicted plot with a y=x reference line; values
# are rounded to the nearest 0.5 for display
plotConfusion(tstChiPred,
trueCol="temperature_2m",
useSub="Predictions using previous city model on new city",
plotCont=TRUE,
rndTo=0.5,
refXY=TRUE
)
## `geom_smooth()` using formula = 'y ~ x'
Predictions are reasonably accurate, though meaningful error remains — notable given that a few standalone predictors (e.g., dewpoint and relative humidity) can mathematically determine the dependent variable, temperature.
The temperature model is re-run on the first city data, using only the mathematically related predictors:
# Re-fit the temperature model on the original city using only the three
# percentile predictors that are mathematically related to temperature
# (dewpoint, vapor pressure deficit, relative humidity)
rfTemp2mBest3 <- runFullRF(dfTrain=dfTrain,
yVar="temperature_2m",
xVars=c("pct_dewpoint_2m", "pct_vapor_pressure_deficit", "pct_relativehumidity_2m"),
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=-1L, # -1L asks the helper to auto-estimate the rounding bucket size
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.95% (RMSE 0.23 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
A function is written to apply a random forest model to new data and report on accuracy of predictions:
newCityPredict <- function(rf,
                           dfTest,
                           trueCol,
                           isContVar=FALSE,
                           reportR2=isTRUE(isContVar),
                           plotCont=isTRUE(isContVar),
                           reportAcc=TRUE,
                           rndReport=2,
                           useLabel="requested data",
                           useTitle=NULL,
                           useSub=NULL,
                           rndTo=NULL,
                           rndBucketsAuto=100,
                           nSig=NULL,
                           refXY=FALSE,
                           returnData=TRUE
                           ) {
  # Apply an existing random forest model to a new dataset, report on
  # prediction accuracy, and plot actual vs. predicted values.
  # FUNCTION ARGUMENTS:
  # rf: The existing "ranger" model OR a list containing element "rf" that has the existing "ranger" model
  # dfTest: the new dataset for predictions
  # trueCol: column containing true value
  # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
  # reportR2: boolean, should accuracy be calculated as R-squared?
  #           (FALSE measures as categorical)
  # plotCont: boolean, should plotting assume continuous variables?
  #           (FALSE assumes confusion plot for categorical variables)
  # reportAcc: boolean, should accuracy be reported (printed to output)?
  # rndReport: number of significant digits for reporting (will be converted to percentage first)
  # useLabel: label for data to be used in reporting
  # useTitle: title to be used for chart (NULL means create from trueCol)
  # useSub: subtitle to be used for chart (NULL means none)
  # rndTo: every number in x should be rounded to the nearest rndTo
  #        NULL means no rounding (default)
  #        -1L means make an estimate based on data
  # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
  # nSig: number of significant digits for automatically calculated rounding parameter
  #       (NULL means calculate exactly)
  # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
  # returnData: boolean, should a list be returned containing tstPred and rfAcc?

  # Get the ranger data: unwrap a list that carries the model in element "rf"
  # (uses inherits() rather than class() comparison, and a single validation
  # after unwrapping; the previous version duplicated the check and had an
  # unbalanced quote in the error message)
  if(!inherits(rf, "ranger") && is.list(rf) && "rf" %in% names(rf)) {
    rf <- rf[["rf"]]
  }
  if(!inherits(rf, "ranger")) {
    stop("\nERROR: rf must be of class 'ranger' OR a list with element 'rf' that is of class 'ranger'")
  }
  # Predict on new dataset
  tstPred <- predictRF(rf=rf, df=dfTest)
  # Report on accuracy
  rfAcc <- reportAccuracy(tstPred,
                          trueCol=trueCol,
                          reportAcc=reportAcc,
                          rndReport=rndReport,
                          useLabel=useLabel,
                          reportR2=reportR2,
                          returnAcc=TRUE
                          )
  # Plot confusion data
  plotConfusion(tstPred,
                trueCol=trueCol,
                useTitle=useTitle,
                useSub=useSub,
                plotCont=plotCont,
                rndTo=rndTo,
                rndBucketsAuto=rndBucketsAuto,
                nSig=nSig,
                refXY=refXY
                )
  # Return data if requested
  if(isTRUE(returnData)) return(list(tstPred=tstPred, rfAcc=rfAcc))
}
The function is applied to make predictions for the new city:
# Apply the original-city 3-predictor temperature model to the Chicago data,
# keeping only the actual and predicted temperature columns from the result
newPreds <- newCityPredict(rfTemp2mBest3,
dfTest=chiTemp,
trueCol="temperature_2m",
isContVar=TRUE,
rndTo=0.5,
refXY=TRUE
)[["tstPred"]][,c("temperature_2m", "pred")]
##
## R-squared of requested data is: 98.11% (RMSE 1.54 vs. 11.18 null)
## `geom_smooth()` using formula = 'y ~ x'
# Summarize prediction error within 5-degree buckets of actual temperature
tmpGG <- newPreds %>%
  mutate(
    act5 = round(temperature_2m / 5) * 5,  # bucket actuals to nearest 5
    err = pred - temperature_2m,
    err2 = err^2
  ) %>%
  group_by(act5) %>%
  summarize(across(.cols = where(is.numeric), .fns = mean), n = n())
tmpGG
## # A tibble: 14 × 6
## act5 temperature_2m pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 -30 -29.7 -16.2 13.4 183. 31
## 2 -25 -24.6 -16.9 7.74 62.3 82
## 3 -20 -19.3 -16.1 3.27 13.1 610
## 4 -15 -14.7 -12.4 2.32 6.30 1757
## 5 -10 -9.66 -7.61 2.05 4.56 4027
## 6 -5 -4.66 -2.92 1.74 3.14 8820
## 7 0 0.103 1.66 1.56 2.51 18708
## 8 5 4.94 6.63 1.69 3.02 17446
## 9 10 9.91 11.5 1.61 2.67 15225
## 10 15 15.1 16.4 1.26 1.74 15091
## 11 20 20.1 21.1 0.979 1.19 21224
## 12 25 24.7 25.5 0.839 1.03 14794
## 13 30 29.3 30.2 0.924 1.40 4534
## 14 35 34.0 34.0 0.00954 1.12 363
# Compare mean actual vs. mean predicted temperature per 5-degree bucket,
# with a dashed y=x reference line
tmpGG %>%
  select(act5, temperature_2m, pred) %>%
  pivot_longer(cols = -act5) %>%
  ggplot(aes(x = act5, y = value)) +
  geom_line(aes(group = name,
                color = c("pred" = "Predicted Mean",
                          "temperature_2m" = "Actual Mean")[name])) +
  geom_abline(slope = 1, intercept = 0, lty = 2) +
  scale_color_discrete("Metric") +
  labs(title = "Actual vs. Predicted Temperature Using Old City Model on New City Data",
       x = "New city actual temperature (rounded to nearest 5)",
       y = "Average temperature for metric")
Temperature predictions are consistently biased high, particularly on the coldest days since the new city experiences low temperatures not seen in the original city training data. Even at warmer temperatures, a slight bias remains such that predictions are generally too high
A basic linear model can drive better temperature predictions:
# Fit a small linear model for temperature from relative humidity, dewpoint,
# and their interaction (t ~ rh + d + rh:d); columns are renamed short on the
# way in so the printed model call stays compact
lmMiniTemp <- dfTrain %>%
select(t=temperature_2m, d=dewpoint_2m, rh=relativehumidity_2m) %>%
lm(t~rh+d+rh:d+1, data=.)
summary(lmMiniTemp)
##
## Call:
## lm(formula = t ~ rh + d + rh:d + 1, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8850 -0.4948 -0.1753 0.3255 11.5218
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.199e+01 1.117e-02 1968.29 <2e-16 ***
## rh -2.355e-01 1.676e-04 -1405.46 <2e-16 ***
## d 1.074e+00 9.007e-04 1192.87 <2e-16 ***
## rh:d -3.242e-04 1.295e-05 -25.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6773 on 78849 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9955
## F-statistic: 5.853e+06 on 3 and 78849 DF, p-value: < 2.2e-16
# Apply the linear model to the new city and summarize error by 5-degree bucket.
# magrittr subtlety: because `.` appears as a top-level named argument (pred=.),
# the piped prediction vector is NOT also inserted as mutate()'s first argument;
# mutate()'s data is select(chiTemp, temperature_2m) and pred=. adds the vector
# as a column.
# NOTE(review): n=n() is created before across(where(is.numeric)), so across()
# also "averages" the fresh n column -- value unchanged but coerced to <dbl>,
# matching the printed tibble
ggMiniTemp <- predict(lmMiniTemp, newdata=chiTemp %>% select(rh=relativehumidity_2m, d=dewpoint_2m)) %>%
mutate(select(chiTemp, temperature_2m),
pred=.,
err=pred-temperature_2m,
err2=err**2,
rnd5=round(temperature_2m/5)*5
) %>%
group_by(rnd5) %>%
summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTemp
## # A tibble: 14 × 6
## rnd5 n temperature_2m pred err err2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -30 31 -29.7 -29.2 0.500 0.301
## 2 -25 82 -24.6 -24.5 0.0965 0.221
## 3 -20 610 -19.3 -19.3 0.0588 0.238
## 4 -15 1757 -14.7 -14.5 0.163 0.285
## 5 -10 4027 -9.66 -9.54 0.125 0.280
## 6 -5 8820 -4.66 -4.58 0.0801 0.278
## 7 0 18708 0.103 0.0319 -0.0706 0.340
## 8 5 17446 4.94 4.94 0.00473 0.317
## 9 10 15225 9.91 10.0 0.0869 0.267
## 10 15 15091 15.1 15.2 0.108 0.258
## 11 20 21224 20.1 20.3 0.166 0.247
## 12 25 14794 24.7 24.9 0.198 0.436
## 13 30 4534 29.3 29.1 -0.221 1.39
## 14 35 363 34.0 32.5 -1.53 5.17
# Plot mean actual vs. mean predicted temperature (linear model) within each
# 5-degree bucket; the dashed y = x line marks perfect calibration
ggMiniTemp %>%
  select(rnd5, temperature_2m, pred) %>%
  pivot_longer(cols=-rnd5) %>%
  ggplot(aes(x=rnd5, y=value)) +
  geom_line(aes(group=name,
                color=c("temperature_2m"="Actual Mean", "pred"="Predicted Mean")[name])) +
  labs(x="New city actual temperature (rounded to nearest 5)",
       y="Average temperature for metric",
       title="Actual vs. Predicted Temperature Using Old City Linear Model on New City Data") +
  scale_color_discrete("Metric") +
  geom_abline(slope=1, intercept=0, linetype=2)
The linear model transfers readily from original city to new city, accurately predicting (given that the true formula is close to linear even outside the training data range) even temperatures it has not been trained on
The random forest is re-run using actual data rather than percentiles for predictors:
# Re-fit the 3-variable RF using raw (actual-value) predictors instead of
# percentiles; rndTo=-1L rounds confusion buckets to tens, refXY adds a y=x
# reference to the plot produced by runFullRF
rfTemp2mBest3Actual <- runFullRF(dfTrain=dfTrain,
yVar="temperature_2m",
xVars=c("dewpoint_2m", "vapor_pressure_deficit", "relativehumidity_2m"),
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.982% (RMSE 0.14 vs. 10.3 null)
## `geom_smooth()` using formula = 'y ~ x'
The function is applied to make predictions for the new city:
# Score the actual-value RF on the new city's data, keeping only the actual
# and predicted temperature columns
newPredsActual <- newCityPredict(rfTemp2mBest3Actual,
dfTest=chiTemp,
trueCol="temperature_2m",
isContVar=TRUE,
rndTo=0.5,
refXY=TRUE
)[["tstPred"]][,c("temperature_2m", "pred")]
##
## R-squared of requested data is: 99.88% (RMSE 0.38 vs. 11.18 null)
## `geom_smooth()` using formula = 'y ~ x'
# Summarize actual-value-model error by 5-degree bucket (same recipe as tmpGG):
# act5 = actual rounded to nearest 5, err = bias, err2 = squared error
tmpGGActual <- newPredsActual %>%
mutate(act5=round(temperature_2m/5)*5, err=pred-temperature_2m, err2=err**2) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGActual
## # A tibble: 14 × 6
## act5 temperature_2m pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 -30 -29.7 -18.1 11.6 135. 31
## 2 -25 -24.6 -18.1 6.49 44.7 82
## 3 -20 -19.3 -17.6 1.78 5.81 610
## 4 -15 -14.7 -14.2 0.543 1.14 1757
## 5 -10 -9.66 -9.44 0.218 0.231 4027
## 6 -5 -4.66 -4.53 0.126 0.0658 8820
## 7 0 0.103 0.170 0.0671 0.0470 18708
## 8 5 4.94 4.95 0.0171 0.0210 17446
## 9 10 9.91 9.91 -0.00286 0.0141 15225
## 10 15 15.1 15.1 -0.00301 0.0140 15091
## 11 20 20.1 20.1 -0.0266 0.0259 21224
## 12 25 24.7 24.6 -0.0636 0.0402 14794
## 13 30 29.3 29.2 -0.0969 0.0472 4534
## 14 35 34.0 33.6 -0.431 0.513 363
# Bucket-level calibration plot for the actual-value RF model. The title names
# the model explicitly so the chart is distinguishable from the earlier
# percentile-model chart, which previously carried an identical title
tmpGGActual %>%
select(act5, temperature_2m, pred) %>%
pivot_longer(cols=-c(act5)) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
)
) +
labs(title="Actual vs. Predicted Temperature Using Old City Actual-Value Model on New City Data",
x="New city actual temperature (rounded to nearest 5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
Temperature predictions are generally accurate, with the exception of the coldest days since the new city experiences low temperatures not seen in the original city training data
Errors by modeling technique are compared:
# Compare error metrics across the three models; .id="src" tags rows "1"/"2"/"3"
# in bind order and the lookup vector maps those to readable labels.
# rmse here is sqrt of the bucket-mean squared error, i.e. per-bucket RMSE
tmpGGActual %>%
bind_rows(tmpGG, rename(ggMiniTemp, act5=rnd5), .id="src") %>%
mutate(src=c("1"="RF\nActual", "2"="RF\nPercentile", "3"="Linear")[src], rmse=sqrt(err2)) %>%
select(src, act5, err, err2, rmse) %>%
pivot_longer(cols=c("err", "err2", "rmse")) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=src, color=src)) +
facet_wrap(~name, scales="free_y") +
labs(x="Actual Temperature (rounded to nearest 5)", y=NULL) +
scale_color_discrete("Model")
The linear model is best able to predict temperatures it has not been trained on, while the RF model using raw values as predictors is best able to predict temperatures it has been trained on. The RF model using percentiles as predictors is least accurate, as the meaning of 50%le on dewpoint will vary by city
The function is applied to make month predictions for the new city, once using the new city’s data and once using the old city’s data:
# Split new city data into train and test sets -- a 3:1 split in favor of TRAIN
# (75% of rows go to chiTempTrain); the original comment said "in favor of test"
# NOTE(review): no set.seed() precedes sample(), so the split is not reproducible
idxTrainChi <- sort(sample(seq_len(nrow(chiTemp)), size=round(0.75*nrow(chiTemp)), replace=FALSE))
chiTempTrain <- chiTemp[idxTrainChi, ]
chiTempTest <- chiTemp[-idxTrainChi, ]
# Using own data, object not returned
# Train a month classifier on the new city's own pre-2022 data and evaluate on
# its 2022 holdout; only the prediction table is kept
chiPredsMonth <- runFullRF(dfTrain=chiTempTrain %>% filter(year(date) < 2022),
yVar="month",
xVars=varsTrain,
dfTest=chiTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]]
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.061%
# Using previous city data
# Apply the previous city's month model (rfMonth) to all new-city rows,
# keeping actual and predicted month
newPredsMonth <- newCityPredict(rfMonth,
dfTest=chiTemp,
trueCol="month",
isContVar=FALSE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("month", "pred")]
##
## Accuracy of requested data is: 70.92%
Month is predicted reasonably when applying the old city model to the new city data, with ~70% accuracy and most errors being +/- 1 month. Predictions are stronger when using new city data as inputs to the model, with ~85% accuracy and almost no prediction errors of 2+ months
Frequency of prediction errors is explored:
# Tally month-prediction errors. dist = actual minus predicted month index;
# moddist wraps dist into [-5, 6] so Dec vs. Jan counts as 1 month apart
# rather than 11
tmpAccDF <- newPredsMonth %>%
count(month, pred) %>%
mutate(dist=as.integer(month)-as.integer(pred),
moddist=(dist+5) %% 12 - 5
)
# Raw vs. wrapped error distribution, weighted by observation counts
tmpAccDF %>%
count(dist, moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 19 × 4
## dist moddist n pct
## <int> <dbl> <int> <dbl>
## 1 -11 1 1515 0.0123
## 2 -10 2 75 0.000611
## 3 -9 3 272 0.00222
## 4 -8 4 46 0.000375
## 5 -7 5 174 0.00142
## 6 -6 6 225 0.00183
## 7 -5 -5 1 0.00000815
## 8 -3 -3 52 0.000424
## 9 -2 -2 564 0.00460
## 10 -1 -1 19393 0.158
## 11 0 0 87025 0.709
## 12 1 1 11430 0.0931
## 13 2 2 248 0.00202
## 14 3 3 14 0.000114
## 15 7 -5 19 0.000155
## 16 8 -4 203 0.00165
## 17 9 -3 26 0.000212
## 18 10 -2 46 0.000375
## 19 11 -1 1384 0.0113
# Share of observations at each wrapped month-error (moddist) value
tmpAccDF %>%
  group_by(moddist) %>%
  summarize(n=sum(n), .groups="drop") %>%
  mutate(pct=n/sum(n))
## # A tibble: 12 × 3
## moddist n pct
## <dbl> <int> <dbl>
## 1 -5 20 0.000163
## 2 -4 203 0.00165
## 3 -3 78 0.000636
## 4 -2 610 0.00497
## 5 -1 20777 0.169
## 6 0 87025 0.709
## 7 1 12945 0.105
## 8 2 323 0.00263
## 9 3 286 0.00233
## 10 4 46 0.000375
## 11 5 174 0.00142
## 12 6 225 0.00183
The model more often predicts one month forward rather than one month backwards:
# Faceted accuracy by ACTUAL month; plotdist clamps wrapped errors to [-2, 2]
# so the x axis shows -2 (or worse), -1, 0, 1, 2 (or worse)
tmpAccDF %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, month) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~month) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by actual month")
# Same view faceted by PREDICTED month; percentages are within each predicted
# month, and plotdist again clamps wrapped errors to [-2, 2]
tmpAccDF %>%
group_by(pred) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, pred) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~pred) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by predicted month")
A similar approach is explored for predictions using the new city data:
# Same error tally for the new-city-trained model: dist = actual minus
# predicted month index; moddist wraps dist into [-5, 6] for year wraparound
tmpAccDF_v2 <- chiPredsMonth %>%
count(month, pred) %>%
mutate(dist=as.integer(month)-as.integer(pred),
moddist=(dist+5) %% 12 - 5
)
# Raw vs. wrapped error distribution, weighted by observation counts
tmpAccDF_v2 %>%
count(dist, moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 5 × 4
## dist moddist n pct
## <int> <dbl> <int> <dbl>
## 1 -11 1 38 0.0175
## 2 -1 -1 104 0.0478
## 3 0 0 1830 0.841
## 4 1 1 190 0.0873
## 5 11 -1 15 0.00689
# Share of observations at each wrapped month-error value (new-city model)
tmpAccDF_v2 %>%
count(moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 3 × 3
## moddist n pct
## <dbl> <int> <dbl>
## 1 -1 119 0.0547
## 2 0 1830 0.841
## 3 1 228 0.105
# Faceted accuracy by ACTUAL month for the new-city-trained model;
# plotdist clamps wrapped errors to [-2, 2]
tmpAccDF_v2 %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, month) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~month) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by actual month")
# Faceted accuracy by PREDICTED month for the new-city-trained model;
# plotdist clamps wrapped errors to [-2, 2]
tmpAccDF_v2 %>%
group_by(pred) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, pred) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~pred) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by predicted month")
Using previous years of new city data to predict a later year of new city month results in ~85% accuracy
The function is applied to make predictions for day/night-season for the new city, once using the new city’s data and once using the old city’s data:
# Using own data, predictions returned
# Train a day/night-season classifier on the new city's own pre-2022 data and
# evaluate on its 2022 holdout
chiPredsTODS <- runFullRF(dfTrain=chiTempTrain %>% filter(year(date) < 2022),
yVar="todSeason",
xVars=varsTrain,
dfTest=chiTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]]
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.905%
# Using previous city data
# Apply the previous city's todSeason model (rfTODS) to all new-city rows
newPredsTODS <- newCityPredict(rfTODS,
dfTest=chiTemp,
trueCol="todSeason",
isContVar=FALSE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("todSeason", "pred")]
##
## Accuracy of requested data is: 83.79%
Day/night and season are predicted reasonably when applying the old city model to the new city data, with ~85% accuracy. Predictions are stronger when using new city data as inputs to the model, with ~90% accuracy
Frequency of prediction errors is explored:
# Decompose "Season-Day/Night" labels: day/night from the "Night" substring,
# season from the leading alphabetic run (everything before the hyphen)
tmpAccTODS <- newPredsTODS %>%
count(todSeason, pred) %>%
mutate(dnPred=ifelse(str_detect(pred, pattern="Night"), "Night", "Day"),
dnAct=ifelse(str_detect(todSeason, pattern="Night"), "Night", "Day"),
seasPred=str_extract(pred, pattern="^[A-Za-z]+"),
seasAct=str_extract(todSeason, pattern="^[A-Za-z]+")
)
# Day-night accuracy
tmpAccTODS %>%
count(dnAct, dnPred, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 4 × 4
## dnAct dnPred n pct
## <chr> <chr> <int> <dbl>
## 1 Day Day 56881 0.464
## 2 Day Night 4475 0.0365
## 3 Night Day 2070 0.0169
## 4 Night Night 59286 0.483
# Season accuracy
# Confusion heatmap for the season component (old-city model on new-city data);
# tile fill and text both show the share of all observations
tmpAccTODS %>%
count(seasAct, seasPred, wt=n) %>%
mutate(pct=n/sum(n)) %>%
ggplot(aes(x=seasPred, y=seasAct)) +
geom_tile(aes(fill=100*pct)) +
geom_text(aes(label=paste0(round(100*pct, 1), "%"))) +
scale_fill_continuous("%", low="white", high="lightgreen") +
labs(x="Predicted",
y="Actual",
title="Season component predicted in todSeason",
subtitle="Old city model applied to new city data"
)
The most common source of error is to predict Spring as Winter or Day as Night
Frequency of prediction errors is explored, using new city data as predictor:
# Same label decomposition for the new-city-trained model's predictions
tmpAccTODSNew <- chiPredsTODS %>%
count(todSeason, pred) %>%
mutate(dnPred=ifelse(str_detect(pred, pattern="Night"), "Night", "Day"),
dnAct=ifelse(str_detect(todSeason, pattern="Night"), "Night", "Day"),
seasPred=str_extract(pred, pattern="^[A-Za-z]+"),
seasAct=str_extract(todSeason, pattern="^[A-Za-z]+")
)
# Day-night accuracy
tmpAccTODSNew %>%
count(dnAct, dnPred, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 4 × 4
## dnAct dnPred n pct
## <chr> <chr> <int> <dbl>
## 1 Day Day 1051 0.483
## 2 Day Night 71 0.0326
## 3 Night Day 30 0.0138
## 4 Night Night 1025 0.471
# Season accuracy
# Confusion heatmap for the season component (new-city model on its own holdout)
tmpAccTODSNew %>%
count(seasAct, seasPred, wt=n) %>%
mutate(pct=n/sum(n)) %>%
ggplot(aes(x=seasPred, y=seasAct)) +
geom_tile(aes(fill=100*pct)) +
geom_text(aes(label=paste0(round(100*pct, 1), "%"))) +
scale_fill_continuous("%", low="white", high="lightgreen") +
labs(x="Predicted",
y="Actual",
title="Season component predicted in todSeason",
subtitle="New city data used for predicting new city holdout year"
)
Season is generally well predicted, with the exception of Spring predicted as Winter and Fall predicted as Summer. Predictions using new city data as input have higher overall accuracy
The function is applied to make predictions for deep soil temperature for the new city, once using the new city’s data and once using the old city’s data:
# Using own data, predictions returned
# Train a deep-soil-temperature model on the new city's own pre-2022 data;
# soil-temperature percentile predictors are excluded from xVars to avoid
# leaking the target, while calendar features (month, tod, doy) are added
chiPredsSoil <- runFullRF(dfTrain=chiTempTrain %>% filter(year(date) < 2022),
yVar="soil_temperature_100_to_255cm",
xVars=c(varsTrain[!str_detect(varsTrain, "pct_soil_temp")], "month", "tod", "doy"),
dfTest=chiTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
rndTo=-1L,
refXY=TRUE,
isContVar=TRUE,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]][,c("soil_temperature_100_to_255cm", "pred")]
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.293% (RMSE 0.9 vs. 6.89 null)
## `geom_smooth()` using formula = 'y ~ x'
# Using previous city data
# Apply the previous city's deep-soil model (rfSoil255) to all new-city rows
newPredsSoil <- newCityPredict(rfSoil255,
dfTest=chiTemp,
trueCol="soil_temperature_100_to_255cm",
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("soil_temperature_100_to_255cm", "pred")]
##
## R-squared of requested data is: 84.54% (RMSE 2.63 vs. 6.7 null)
## `geom_smooth()` using formula = 'y ~ x'
Accuracy of new city predictions using old city data model is explored:
# Summarize error by soil-temperature bucket.
# NOTE(review): despite the name act5, buckets are the nearest 2.5 degrees here
tmpGGSoil <- newPredsSoil %>%
mutate(act5=round(soil_temperature_100_to_255cm/2.5)*2.5,
err=pred-soil_temperature_100_to_255cm,
err2=err**2
) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGSoil
## # A tibble: 10 × 6
## act5 soil_temperature_100_to_255cm pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 0.983 6.28 5.30 29.6 2207
## 2 2.5 2.63 6.43 3.80 17.3 15641
## 3 5 4.85 7.51 2.66 10.3 17933
## 4 7.5 7.47 9.23 1.77 7.17 12856
## 5 10 9.98 11.6 1.67 6.71 10774
## 6 12.5 12.5 13.6 1.10 4.02 10399
## 7 15 15.0 15.6 0.608 2.47 10980
## 8 17.5 17.6 17.5 -0.0747 1.22 13527
## 9 20 20.1 19.1 -1.02 2.09 21207
## 10 22.5 21.7 19.7 -1.98 5.13 7188
# Plot mean actual vs. mean predicted deep soil temperature within each
# 2.5-degree bucket; the dashed y = x line marks perfect calibration
tmpGGSoil %>%
  select(act5, soil_temperature_100_to_255cm, pred) %>%
  pivot_longer(cols=-act5) %>%
  ggplot(aes(x=act5, y=value)) +
  geom_line(aes(group=name,
                color=c("soil_temperature_100_to_255cm"="Actual Mean",
                        "pred"="Predicted Mean")[name])) +
  labs(x="New city actual soil temperature (rounded to nearest 2.5)",
       y="Average temperature for metric",
       title="Actual vs. Predicted Soil Temperature Using Old City Model on New City Data") +
  scale_color_discrete("Metric") +
  geom_abline(slope=1, intercept=0, linetype=2)
Accuracy of new city predictions using new city data model is explored:
# Same bucket summary for the new-city-trained soil model
# (act5 again buckets to the nearest 2.5 degrees)
tmpGGSoilNew <- chiPredsSoil %>%
mutate(act5=round(soil_temperature_100_to_255cm/2.5)*2.5,
err=pred-soil_temperature_100_to_255cm,
err2=err**2
) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGSoilNew
## # A tibble: 9 × 6
## act5 soil_temperature_100_to_255cm pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2.5 2.57 3.25 0.682 0.864 420
## 2 5 4.86 4.74 -0.124 0.310 257
## 3 7.5 7.51 6.79 -0.713 1.14 152
## 4 10 9.94 10.0 0.0870 2.62 195
## 5 12.5 12.6 12.6 0.0277 0.142 169
## 6 15 15.0 15.4 0.406 0.894 179
## 7 17.5 17.6 17.0 -0.625 0.837 266
## 8 20 20.2 19.9 -0.260 0.478 449
## 9 22.5 21.4 20.9 -0.446 0.208 90
# Calibration plot for the new-city-trained soil model; dashed y = x line
# marks perfect calibration
tmpGGSoilNew %>%
select(act5, soil_temperature_100_to_255cm, pred) %>%
pivot_longer(cols=-c(act5)) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean",
"soil_temperature_100_to_255cm"="Actual Mean"
)[name]
)
) +
labs(title="Actual vs. Predicted Soil Temperature Using New City Model on New City Data",
x="New city actual soil temperature (rounded to nearest 2.5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
The previous city soil temperature model is run excluding doy, month, and other variables that will tend to bias towards the previous city’s average soil temperature, and using absolute values rather than percentiles:
# Build a reduced predictor set from rfSoil255's importance table: take the top
# 10 by importance FIRST, then keep only the percentile (pct_) variables of
# those, and strip the prefix so the raw-value columns are used instead.
# Note the order matters: fewer than 10 variables survive the filter
rfSoilSmallVars <- rfSoil255$rfImp %>%
arrange(desc(imp)) %>%
head(10) %>%
filter(str_detect(metric, pattern="^pct_")) %>%
pull(metric) %>%
str_replace(pattern="^pct_", replacement="")
# Re-fit the old-city deep-soil model on raw values of the reduced set
# (calendar variables such as doy/month are dropped by construction)
rfSoil255Small <- runFullRF(dfTrain=dfTrain,
yVar="soil_temperature_100_to_255cm",
xVars=rfSoilSmallVars,
dfTest=dfTest,
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
returnData=TRUE
)
##
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.924% (RMSE 2.34 vs. 5.84 null)
## `geom_smooth()` using formula = 'y ~ x'
Without day-of-year, deep soil temperature predictions become much less accurate
The model is then run to predict deep soil temperature in the new city:
# Using previous city data
# Apply the reduced raw-value soil model to all new-city rows
newPredsSoilSmall <- newCityPredict(rfSoil255Small,
dfTest=chiTemp,
trueCol="soil_temperature_100_to_255cm",
isContVar=TRUE,
rndTo=-1L,
refXY=TRUE,
useSub="Previous city model applied to make new city predictions"
)[["tstPred"]][,c("soil_temperature_100_to_255cm", "pred")]
##
## R-squared of requested data is: -92.82% (RMSE 9.3 vs. 6.7 null)
## `geom_smooth()` using formula = 'y ~ x'
# Data frame of distributions by key variables
# Compare the target plus the model's top-3 predictors between cities.
# count(get(x)) names the resulting column literally `get(x)`, which
# colRenamer() then renames to "value"; boxplots are weighted by the counts
disVars <- c("soil_temperature_100_to_255cm",
rfSoil255Small$rfImp %>% arrange(-imp) %>% slice(1:3) %>% pull(metric)
)
map_dfr(.x=disVars,
.f=function(x) {
nycTemp %>%
count(get(x)) %>%
bind_rows(count(chiTemp, get(x)), .id="src") %>%
mutate(src=c("1"="Prev", "2"="New")[src], keyVar=x) %>%
colRenamer(vecRename=c(`get(x)`="value"))
}
) %>%
ggplot(aes(x=src)) +
geom_boxplot(aes(y=value, weight=n), fill="lightblue") +
facet_wrap(~keyVar, scales="free_y") +
labs(title="Distributions of Key Variables in Previous City and New City", x=NULL, y=NULL)
Predictions become completely inaccurate, since distributions of soil temperature are similar between the two cities, but distributions of the key predictors (in particular, soil moisture) are not
Accuracy of new city predictions using updated previous city data model is explored:
# Bucket summary (nearest 2.5 degrees, despite the act5 name) for the reduced
# raw-value soil model applied to the new city
tmpGGSoilSmall <- newPredsSoilSmall %>%
mutate(act5=round(soil_temperature_100_to_255cm/2.5)*2.5,
err=pred-soil_temperature_100_to_255cm,
err2=err**2
) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGSoilSmall
## # A tibble: 10 × 6
## act5 soil_temperature_100_to_255cm pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 0.983 17.8 16.8 283. 2207
## 2 2.5 2.63 17.8 15.2 232. 15641
## 3 5 4.85 18.1 13.2 176. 17933
## 4 7.5 7.47 18.3 10.9 120. 12856
## 5 10 9.98 18.6 8.65 76.8 10774
## 6 12.5 12.5 18.9 6.39 43.4 10399
## 7 15 15.0 19.4 4.34 21.1 10980
## 8 17.5 17.6 19.9 2.31 7.26 13527
## 9 20 20.1 20.4 0.319 1.31 21207
## 10 22.5 21.7 20.7 -0.955 1.29 7188
# Calibration plot for the reduced old-city model on new-city data;
# dashed y = x line marks perfect calibration
tmpGGSoilSmall %>%
select(act5, soil_temperature_100_to_255cm, pred) %>%
pivot_longer(cols=-c(act5)) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean",
"soil_temperature_100_to_255cm"="Actual Mean"
)[name]
)
) +
labs(title="Actual vs. Predicted Soil Temperature Using Updated Previous City Model on New City Data",
x="New city actual soil temperature (rounded to nearest 2.5)",
y="Average deep soil temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
Errors by modeling technique are compared:
# Compare error metrics across the three soil models; .id="src" tags rows in
# bind order ("1" = tmpGGSoil, "2" = tmpGGSoilNew, "3" = tmpGGSoilSmall) and
# the lookup vector maps those tags to readable labels.
# rmse is sqrt of the bucket-mean squared error, i.e. per-bucket RMSE
tmpGGSoil %>%
bind_rows(tmpGGSoilNew, tmpGGSoilSmall, .id="src") %>%
mutate(src=c("1"="Old City RF Model\n",
"2"="New City RF Model\n",
"3"="Old City RF Model\nNo Day/Month\n"
)[src],
rmse=sqrt(err2)
) %>%
select(src, act5, err, err2, rmse) %>%
pivot_longer(cols=c("err", "err2", "rmse")) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=src, color=src)) +
facet_wrap(~name, scales="free_y") +
labs(x="Actual Deep Soil Temperature (rounded to nearest 2.5)",
y=NULL,
title="Accuracy of New City Predictions by Model Type"
) +
geom_hline(yintercept=0, lty=2) +
scale_color_discrete("Model")
Daily and hourly data are downloaded for Los Angeles, cached to avoid multiple hits to the server:
# Hourly data download for Los Angeles, CA
# Build the Open-Meteo archive URL requesting every hourly metric listed in
# tblMetricsHourly for 2010-2023, localized to Pacific time
testURLHourly <- helperOpenMeteoURL(cityName="Los Angeles CA",
hourlyIndices=1:nrow(tblMetricsHourly),
startDate="2010-01-01",
endDate="2023-12-31",
tz="America/Los_Angeles"
)
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
# Echo the constructed hourly API URL for inspection
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=34.11&longitude=-118.41&start_date=2010-01-01&end_date=2023-12-31&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FLos_Angeles"
# Fetch the hourly JSON only when it is not already cached on disk
if (file.exists("testOM_hourly_lax.json")) {
  cat("\nFile testOM_hourly_lax.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_hourly_lax.json", url=testURLHourly)
}
##
## File testOM_hourly_lax.json already exists, skipping download
# Daily data download for Los Angeles, CA
# Build the Open-Meteo archive URL requesting every daily metric listed in
# tblMetricsDaily for 2010-2023, localized to Pacific time
testURLDaily <- helperOpenMeteoURL(cityName="Los Angeles CA",
dailyIndices=1:nrow(tblMetricsDaily),
startDate="2010-01-01",
endDate="2023-12-31",
tz="America/Los_Angeles"
)
##
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
# Echo the constructed daily API URL for inspection
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=34.11&longitude=-118.41&start_date=2010-01-01&end_date=2023-12-31&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FLos_Angeles"
# Fetch the daily JSON only when it is not already cached on disk
if (file.exists("testOM_daily_lax.json")) {
  cat("\nFile testOM_daily_lax.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_daily_lax.json", url=testURLDaily)
}
##
## File testOM_daily_lax.json already exists, skipping download
Core datasets for Los Angeles are loaded, with explanatory variables added for future processing:
# Read daily JSON file
# Parse the cached daily JSON into the list structure (tblDaily/tblHourly/
# tblUnits/tblDescription) returned by readOpenMeteoJSON
laxOMDaily <- readOpenMeteoJSON("testOM_daily_lax.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
# Echo the parsed daily object for inspection
laxOMDaily
## $tblDaily
## # A tibble: 5,113 × 18
## date time weathercode temperature_2m_max temperature_2m_min
## <date> <chr> <int> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 2 20.1 4.7
## 2 2010-01-02 2010-01-02 1 23.2 6.7
## 3 2010-01-03 2010-01-03 1 23 6.5
## 4 2010-01-04 2010-01-04 2 22.1 6.5
## 5 2010-01-05 2010-01-05 1 22.9 5
## 6 2010-01-06 2010-01-06 2 23.2 7.7
## 7 2010-01-07 2010-01-07 1 23.3 5.2
## 8 2010-01-08 2010-01-08 1 22.8 8.4
## 9 2010-01-09 2010-01-09 2 21.5 7.2
## 10 2010-01-10 2010-01-10 1 24 7.5
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## # apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## # snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 34.1 -118. 58.9 -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
# Print formatted metadata (lat/lon, timezone, elevation) for the daily pull
prettyOpenMeteoMeta(laxOMDaily)
##
## latitude: 34.13005
## longitude: -118.4981
## generationtime_ms: 58.85398
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 333
# Read hourly JSON file
# Parse the cached hourly JSON into the same list structure as the daily read
laxOMHourly <- readOpenMeteoJSON("testOM_hourly_lax.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
# Echo the parsed hourly object for inspection
laxOMHourly
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 122,712 × 37
## time date hour temperature_2m relativehumidity_2m
## <dttm> <date> <int> <dbl> <int>
## 1 2010-01-01 00:00:00 2010-01-01 0 6.3 60
## 2 2010-01-01 01:00:00 2010-01-01 1 5.7 62
## 3 2010-01-01 02:00:00 2010-01-01 2 5.3 63
## 4 2010-01-01 03:00:00 2010-01-01 3 5 64
## 5 2010-01-01 04:00:00 2010-01-01 4 4.8 64
## 6 2010-01-01 05:00:00 2010-01-01 5 4.7 64
## 7 2010-01-01 06:00:00 2010-01-01 6 4.7 64
## 8 2010-01-01 07:00:00 2010-01-01 7 4.8 64
## 9 2010-01-01 08:00:00 2010-01-01 8 5.2 64
## 10 2010-01-01 09:00:00 2010-01-01 9 6.3 63
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # ℹ 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 34.1 -118. 6196. -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
# Print formatted metadata (lat/lon, timezone, elevation) for the hourly pull
prettyOpenMeteoMeta(laxOMHourly)
##
## latitude: 34.13005
## longitude: -118.4981
## generationtime_ms: 6196.377
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 333
# Create percentiles for numeric variables
# Feature engineering for Los Angeles, mirroring the earlier cities.
# mutate() evaluates sequentially: tod/season are built as characters, used to
# compose todSeason, then all three are converted to ordered factors.
# The final across() percentile-ranks EVERY numeric column -- including derived
# ones such as year, hour, and doy -- into pct_* columns (0-100 scale)
laxTemp <- laxOMHourly$tblHourly %>%
mutate(year=year(date),
month=factor(month.abb[lubridate::month(date)], levels=month.abb),
hour=lubridate::hour(time),
fct_hour=factor(hour),
# Day is defined as the hours 07:00 through 18:00 inclusive
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"),
doy=yday(date),
season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Fall",
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
TRUE~"typo"
),
todSeason=paste0(season, "-", tod),
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
),
across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
)
glimpse(laxTemp)
## Rows: 122,712
## Columns: 80
## $ time <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m <dbl> 6.3, 5.7, 5.3, 5.0, 4.8, 4.7, 4.7, 4…
## $ relativehumidity_2m <int> 60, 62, 63, 64, 64, 64, 64, 64, 64, …
## $ dewpoint_2m <dbl> -0.9, -1.0, -1.2, -1.3, -1.4, -1.4, …
## $ apparent_temperature <dbl> 2.9, 2.3, 1.8, 1.3, 1.0, 0.9, 0.9, 1…
## $ pressure_msl <dbl> 1026.5, 1026.1, 1025.7, 1025.7, 1024…
## $ surface_pressure <dbl> 985.7, 985.2, 984.8, 984.7, 983.9, 9…
## $ precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover <int> 14, 21, 23, 29, 31, 30, 29, 30, 31, …
## $ cloudcover_low <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover_mid <int> 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 2, 6, …
## $ cloudcover_high <int> 48, 71, 78, 95, 100, 99, 98, 99, 100…
## $ shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 142, …
## $ direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 27, 16…
## $ direct_normal_irradiance <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 115, …
## $ windspeed_10m <dbl> 7.4, 7.8, 8.0, 9.7, 9.7, 10.1, 10.0,…
## $ windspeed_100m <dbl> 10.4, 10.6, 11.0, 14.9, 14.8, 14.6, …
## $ winddirection_10m <int> 14, 13, 10, 15, 15, 17, 15, 13, 13, …
## $ winddirection_100m <int> 20, 24, 19, 20, 18, 20, 18, 18, 16, …
## $ windgusts_10m <dbl> 19.1, 19.1, 19.4, 19.8, 20.9, 21.6, …
## $ et0_fao_evapotranspiration <dbl> 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, …
## $ weathercode <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ vapor_pressure_deficit <dbl> 0.38, 0.35, 0.33, 0.31, 0.31, 0.31, …
## $ soil_temperature_0_to_7cm <dbl> 7.0, 6.6, 6.2, 5.8, 5.6, 5.4, 5.3, 5…
## $ soil_temperature_7_to_28cm <dbl> 10.8, 10.6, 10.3, 10.1, 9.9, 9.7, 9.…
## $ soil_temperature_28_to_100cm <dbl> 12.9, 12.9, 12.9, 12.9, 12.9, 12.9, …
## $ soil_temperature_100_to_255cm <dbl> 20.5, 20.5, 20.5, 20.5, 20.5, 20.5, …
## $ soil_moisture_0_to_7cm <dbl> 0.205, 0.205, 0.205, 0.205, 0.205, 0…
## $ soil_moisture_7_to_28cm <dbl> 0.251, 0.251, 0.251, 0.250, 0.250, 0…
## $ soil_moisture_28_to_100cm <dbl> 0.168, 0.168, 0.168, 0.168, 0.168, 0…
## $ soil_moisture_100_to_255cm <dbl> 0.165, 0.165, 0.165, 0.165, 0.165, 0…
## $ origTime <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod <fct> Night, Night, Night, Night, Night, N…
## $ doy <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m <dbl> 4, 3, 3, 2, 2, 2, 2, 2, 3, 4, 12, 34…
## $ pct_relativehumidity_2m <dbl> 52, 54, 55, 57, 57, 57, 57, 57, 57, …
## $ pct_dewpoint_2m <dbl> 15, 15, 15, 14, 14, 14, 14, 14, 15, …
## $ pct_apparent_temperature <dbl> 4, 3, 3, 2, 2, 2, 2, 2, 2, 4, 10, 28…
## $ pct_pressure_msl <dbl> 100, 100, 99, 99, 99, 99, 98, 98, 98…
## $ pct_surface_pressure <dbl> 99, 99, 99, 99, 98, 98, 97, 97, 97, …
## $ pct_precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover <dbl> 58, 63, 65, 71, 75, 73, 71, 73, 75, …
## $ pct_cloudcover_low <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover_mid <dbl> 0, 0, 0, 0, 76, 0, 0, 0, 78, 80, 78,…
## $ pct_cloudcover_high <dbl> 80, 84, 85, 91, 96, 95, 94, 95, 96, …
## $ pct_shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 59, 6…
## $ pct_direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 55, 6…
## $ pct_direct_normal_irradiance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 54, 6…
## $ pct_diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 86, 9…
## $ pct_windspeed_10m <dbl> 61, 64, 65, 77, 77, 79, 79, 79, 79, …
## $ pct_windspeed_100m <dbl> 60, 61, 63, 81, 80, 80, 79, 79, 78, …
## $ pct_winddirection_10m <dbl> 6, 5, 3, 7, 7, 8, 7, 5, 5, 5, 7, 9, …
## $ pct_winddirection_100m <dbl> 8, 10, 8, 8, 7, 8, 7, 7, 6, 4, 4, 4,…
## $ pct_windgusts_10m <dbl> 51, 51, 52, 53, 56, 58, 58, 59, 58, …
## $ pct_et0_fao_evapotranspiration <dbl> 34, 34, 34, 34, 34, 34, 34, 34, 34, …
## $ pct_weathercode <dbl> 0, 63, 63, 63, 63, 63, 63, 63, 63, 6…
## $ pct_vapor_pressure_deficit <dbl> 31, 29, 28, 27, 27, 27, 26, 27, 27, …
## $ pct_soil_temperature_0_to_7cm <dbl> 3, 3, 2, 2, 2, 1, 1, 1, 1, 2, 5, 15,…
## $ pct_soil_temperature_7_to_28cm <dbl> 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, …
## $ pct_soil_temperature_28_to_100cm <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ pct_soil_temperature_100_to_255cm <dbl> 64, 64, 64, 64, 64, 64, 64, 64, 64, …
## $ pct_soil_moisture_0_to_7cm <dbl> 83, 83, 83, 83, 83, 83, 83, 83, 83, …
## $ pct_soil_moisture_7_to_28cm <dbl> 87, 87, 87, 87, 87, 87, 87, 87, 87, …
## $ pct_soil_moisture_28_to_100cm <dbl> 56, 56, 56, 56, 56, 56, 56, 56, 56, …
## $ pct_soil_moisture_100_to_255cm <dbl> 34, 34, 34, 34, 34, 34, 34, 34, 34, …
## $ pct_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity-check coverage: distribution of day-of-year within each month
# (geom_boxplot weighted by the per-day record count)
laxTemp %>%
count(doy, month) %>%
ggplot(aes(y=doy, x=month)) +
geom_boxplot(aes(weight=n), fill="lightblue") +
labs(title="Observations by day-of-year and month", x=NULL, y="Day of Year")
# Sanity-check coverage: record counts for every year x month cell
laxTemp %>%
count(year, month) %>%
ggplot(aes(y=factor(year), x=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=3) +
scale_fill_continuous("# Records", low="white", high="green") +
labs(title="Records by year and month", x=NULL, y=NULL)
laxTemp %>% count(todSeason, season, tod)
## # A tibble: 8 × 4
## todSeason season tod n
## <fct> <fct> <fct> <int>
## 1 Spring-Day Spring Day 15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day Summer Day 15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day Fall Day 15288
## 6 Fall-Night Fall Night 15288
## 7 Winter-Day Winter Day 15156
## 8 Winter-Night Winter Night 15156
laxTemp %>% count(hour, fct_hour, tod) %>% print(n=30)
## # A tibble: 24 × 4
## hour fct_hour tod n
## <int> <fct> <fct> <int>
## 1 0 0 Night 5113
## 2 1 1 Night 5113
## 3 2 2 Night 5113
## 4 3 3 Night 5113
## 5 4 4 Night 5113
## 6 5 5 Night 5113
## 7 6 6 Night 5113
## 8 7 7 Day 5113
## 9 8 8 Day 5113
## 10 9 9 Day 5113
## 11 10 10 Day 5113
## 12 11 11 Day 5113
## 13 12 12 Day 5113
## 14 13 13 Day 5113
## 15 14 14 Day 5113
## 16 15 15 Day 5113
## 17 16 16 Day 5113
## 18 17 17 Day 5113
## 19 18 18 Day 5113
## 20 19 19 Night 5113
## 21 20 20 Night 5113
## 22 21 21 Night 5113
## 23 22 22 Night 5113
## 24 23 23 Night 5113
laxTemp %>% count(month, season)
## # A tibble: 12 × 3
## month season n
## <fct> <fct> <int>
## 1 Jan Winter 10416
## 2 Feb Winter 9480
## 3 Mar Spring 10416
## 4 Apr Spring 10080
## 5 May Spring 10416
## 6 Jun Summer 10080
## 7 Jul Summer 10416
## 8 Aug Summer 10416
## 9 Sep Fall 10080
## 10 Oct Fall 10416
## 11 Nov Fall 10080
## 12 Dec Winter 10416
Distributions of several key variables are explored:
# Variables of interest for the cross-city distribution comparison
keyVars <- c('temperature_2m',
'relativehumidity_2m',
'dewpoint_2m',
'shortwave_radiation',
'vapor_pressure_deficit',
'soil_temperature_28_to_100cm',
'soil_temperature_100_to_255cm',
'soil_moisture_28_to_100cm',
'soil_moisture_100_to_255cm'
)
# Stack the three cities (colSelector is a custom helper from the sourced
# utility file; presumably subsets to the requested columns) and compare
# each key metric's distribution by city. bind_rows() .id order: 1=LA
# (piped-in laxTemp), 2=Chicago, 3=NYC — the cty lookup below matches that.
laxTemp %>%
colSelector(vecSelect=keyVars) %>%
bind_rows(colSelector(chiTemp, vecSelect=keyVars),
colSelector(nycTemp, vecSelect=keyVars),
.id="src"
) %>%
mutate(cty=c("1"="LA", "2"="Chicago", "3"="NYC")[src]) %>%
pivot_longer(cols=-c(src, cty)) %>%
ggplot(aes(x=cty, y=value)) +
geom_boxplot(aes(fill=cty)) +
facet_wrap(~name, scales="free_y") +
labs(x=NULL, y=NULL, title="Distribution of Key Metrics by City") +
scale_fill_discrete(NULL)
The scatter of temperature and dewpoint is also explored:
# Temperature vs. dewpoint by city; values rounded to whole degrees so
# repeated (t, d) pairs collapse to counts (point size / lm weight).
# bind_rows() .id order here: 1=LA, 2=NYC, 3=Chicago.
laxTemp %>%
bind_rows(nycTemp, chiTemp, .id="src") %>%
select(t=temperature_2m, d=dewpoint_2m, src) %>%
mutate(across(.cols=where(is.numeric), .fns=function(x) round(x))) %>%
count(src, t, d) %>%
ggplot(aes(x=t, y=d)) +
geom_point(aes(size=n, color=c("1"="LA", "2"="NYC", "3"="Chicago")[src]), alpha=0.5) +
geom_smooth(aes(color=c("1"="LA", "2"="NYC", "3"="Chicago")[src], weight=n), method="lm") +
labs(x="Temperature (C)", y="Dewpoint (C)", title="Temperature vs. Dewpoint", subtitle="Hourly") +
scale_color_discrete(NULL) +
scale_size_continuous("# Obs")
## `geom_smooth()` using formula = 'y ~ x'
# Pearson correlation of temperature and dewpoint per city
laxTemp %>%
bind_rows(nycTemp, chiTemp, .id="src") %>%
mutate(src=c("1"="LA", "2"="NYC", "3"="Chicago")[src]) %>%
group_by(src) %>%
summarize(cor_td=cor(temperature_2m, dewpoint_2m))
## # A tibble: 3 × 2
## src cor_td
## <chr> <dbl>
## 1 Chicago 0.950
## 2 LA 0.273
## 3 NYC 0.919
Whereas NYC and Chicago show a largely linear relationship between temperature and dewpoint (correlations of roughly 0.92-0.95), LA has a much lower correlation (~0.27) over a much narrower range of values, especially temperatures
The linear approximation for estimating temperature based on dewpoint and relative humidity is applied:
# Apply the previously fit linear model (lmMiniTemp, from the earlier city)
# to LA's relative humidity and dewpoint, then summarize prediction error by
# actual temperature rounded to the nearest 5 degrees.
# NOTE(review): the pipe feeds predict()'s result into mutate() as its data
# argument while also referencing it as `.` for pred — an unusual construction;
# confirm that predict() here yields something mutate() accepts as tabular.
ggMiniTempLA <- predict(lmMiniTemp, newdata=laxTemp %>% select(rh=relativehumidity_2m, d=dewpoint_2m)) %>%
mutate(select(laxTemp, temperature_2m),
pred=.,
err=pred-temperature_2m,
err2=err**2,
rnd5=round(temperature_2m/5)*5
) %>%
group_by(rnd5) %>%
summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTempLA
## # A tibble: 10 × 6
## rnd5 n temperature_2m pred err err2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 832 1.34 1.37 0.0337 0.300
## 2 5 7677 5.68 5.39 -0.285 1.08
## 3 10 26309 10.3 9.74 -0.550 2.53
## 4 15 32805 15.0 14.3 -0.706 5.74
## 5 20 26486 19.8 18.5 -1.38 14.0
## 6 25 16364 24.8 21.7 -3.12 32.5
## 7 30 9339 29.6 24.3 -5.38 54.1
## 8 35 2620 34.3 25.8 -8.47 98.7
## 9 40 259 38.8 25.9 -12.9 190.
## 10 45 21 43.4 24.3 -19.1 372.
# Plot mean actual vs. mean predicted temperature within each 5-degree bucket;
# the dashed y=x line marks perfect calibration.
ggMiniTempLA %>%
select(rnd5, temperature_2m, pred) %>%
pivot_longer(cols=-c(rnd5)) %>%
ggplot(aes(x=rnd5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
)
) +
labs(title="Actual vs. Predicted Temperature Using Old City Linear Model on New City Data",
x="New city actual temperature (rounded to nearest 5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
The linear approximation based on dewpoint and relative humidity in NYC is inaccurate for predicting temperatures in LA, particularly temperatures above 20C (where LA often has low dewpoints while NYC does not)
The original city random forest model is applied to make predictions for the third city:
# Apply the previously trained random forest (actual-value variant) to LA data
# via the newCityPredict helper; keep only the actual/predicted columns from
# the returned test-prediction table.
newPredsActualLA <- newCityPredict(rfTemp2mBest3Actual,
dfTest=laxTemp,
trueCol="temperature_2m",
isContVar=TRUE,
rndTo=0.5,
refXY=TRUE
)[["tstPred"]][,c("temperature_2m", "pred")]
##
## R-squared of requested data is: 97.42% (RMSE 1.16 vs. 7.21 null)
## `geom_smooth()` using formula = 'y ~ x'
# Summarize RF prediction error by actual temperature bucket (nearest 5 C)
tmpGGActualLA <- newPredsActualLA %>%
mutate(act5=round(temperature_2m/5)*5, err=pred-temperature_2m, err2=err**2) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGActualLA
## # A tibble: 10 × 6
## act5 temperature_2m pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 0 1.34 1.38 0.0433 0.0318 832
## 2 5 5.68 5.70 0.0231 0.0377 7677
## 3 10 10.3 10.3 0.0116 0.0421 26309
## 4 15 15.0 15.0 -0.0272 0.0405 32805
## 5 20 19.8 19.7 -0.109 0.219 26486
## 6 25 24.8 24.3 -0.490 2.24 16364
## 7 30 29.6 28.6 -1.03 6.43 9339
## 8 35 34.3 32.2 -2.11 16.3 2620
## 9 40 38.8 32.9 -5.91 50.9 259
## 10 45 43.4 31.3 -12.1 152. 21
# Calibration plot for the RF model applied to the third city; dashed y=x
# marks perfect calibration.
tmpGGActualLA %>%
select(act5, temperature_2m, pred) %>%
pivot_longer(cols=-c(act5)) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
)
) +
labs(title="Actual vs. Predicted Temperature Using Old City Model on Third City Data",
x="Third city actual temperature (rounded to nearest 5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
Predictions are improved for temperatures in the original city’s range. Random forest cannot predict temperatures not in the original city training data, leading to inaccuracy for third city temperature above 30-35 degrees C.
The linear relationship among temperature and dewpoint for a given relative humidity is further explored:
# Example plots
# Temperature vs. dewpoint at selected fixed relative-humidity values, one
# facet per RH. Note the lm smooth has no color aesthetic, so it pools the
# three cities within each facet; only the points are colored by city.
laxTemp %>%
bind_rows(nycTemp, chiTemp, .id="src") %>%
select(r=relativehumidity_2m, d=dewpoint_2m, t=temperature_2m, src) %>%
filter(r %in% c(15, 25, 35, 45, 55, 65, 75, 85, 95)) %>%
ggplot(aes(x=d, y=t)) +
geom_point(alpha=0.25, aes(color=c("1"="LA", "2"="NYC", "3"="Chicago")[src])) +
facet_wrap(~r) +
geom_abline(intercept = 0, slope=1, lty=2) +
geom_smooth(method="lm") +
scale_color_discrete(NULL) +
labs(x="Dewpoint", y="Temperature", title="Temperature vs. Dewpoint given Relative Humidity")
## `geom_smooth()` using formula = 'y ~ x'
# Formula for linear relationship
# Fit, for each relative-humidity value treated as a factor level, its own
# intercept and its own dewpoint slope: t ~ r + d:r + 0 suppresses the global
# intercept so every r level receives a directly interpretable intercept
# coefficient alongside its d:r interaction slope.
# (A leftover argument-less filter() call — a no-op — was removed from the
# pipeline.)
lmTempControlRH <- laxTemp %>%
bind_rows(nycTemp, .id="src") %>%
select(r=relativehumidity_2m, d=dewpoint_2m, t=temperature_2m, src) %>%
mutate(r=factor(r)) %>%
lm(t~r+d:r+0, data=.)
# Tidy the coefficient vector: names look like "r15" (per-RH intercept) or
# "r15:d" (per-RH dewpoint slope). The RH value is the digits in the name;
# the presence of ":" distinguishes slopes from intercepts.
lmTempControlRHCoef <- lmTempControlRH %>%
coef() %>%
as.data.frame() %>%
purrr::set_names("coef") %>%
rownames_to_column("var") %>%
tibble::as_tibble() %>%
mutate(rh=as.integer(str_extract(var, pattern="\\d+")),
metric=ifelse(str_detect(var, pattern=":"), "slope", "intercept")
)
# Plot fitted intercept and slope as a function of relative humidity
lmTempControlRHCoef %>%
ggplot(aes(x=rh, y=coef)) +
geom_point() +
facet_wrap(~metric, scales="free_y") +
labs(x="Relative Humidity",
y=NULL,
title="Linear fit for Temperature = Intercept + Slope*Dewpoint",
subtitle="Calculated for each relative humidity (rounded to the nearest percent)"
)
The function is applied to make predictions for month for the new city, once using the new city’s data and once using the old city’s data:
# Split new city data into test and train data (3:1 split in favor of TRAIN:
# 75% of rows are sampled into the training index)
# NOTE(review): prefer seq_len(nrow(laxTemp)) over 1:nrow(...) as a habit,
# though nrow > 0 is guaranteed here.
set.seed(24042914)
idxTrainLA <- sort(sample(1:nrow(laxTemp), size=round(0.75*nrow(laxTemp)), replace=FALSE))
laxTempTrain <- laxTemp[idxTrainLA, ]
laxTempTest <- laxTemp[-idxTrainLA, ]
# Using own data; the fitted RF object is not kept — only the holdout-year
# test predictions are retained from runFullRF's return list
laxPredsMonth <- runFullRF(dfTrain=laxTempTrain %>% filter(year(date) < 2022),
yVar="month",
xVars=varsTrain,
dfTest=laxTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]]
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.738%
# Using previous city data: apply the old-city month model (rfMonth) to all
# LA rows; keep only actual month and prediction
newPredsMonthLA <- newCityPredict(rfMonth,
dfTest=laxTemp,
trueCol="month",
isContVar=FALSE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("month", "pred")]
##
## Accuracy of requested data is: 33.66%
Month is predicted with ~34% accuracy, with most errors within +/- 2 months, when applying the old city model to the new city data. Predictions are stronger when using new city data as inputs to the model, with ~48% accuracy and almost no prediction errors of more than 2 months
Frequency of prediction errors is explored:
# Error table: dist is the raw signed month difference (actual - predicted);
# moddist wraps it onto the 12-month circle into the range [-5, 6], so e.g.
# actual Jan vs. predicted Dec (dist = -11) becomes +1.
tmpAccDFLA <- newPredsMonthLA %>%
count(month, pred) %>%
mutate(dist=as.integer(month)-as.integer(pred),
moddist=(dist+5) %% 12 - 5
)
# Share of observations at each raw/wrapped distance
tmpAccDFLA %>%
count(dist, moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 21 × 4
## dist moddist n pct
## <int> <dbl> <int> <dbl>
## 1 -11 1 2715 0.0221
## 2 -10 2 1680 0.0137
## 3 -9 3 878 0.00715
## 4 -8 4 39 0.000318
## 5 -7 5 268 0.00218
## 6 -6 6 259 0.00211
## 7 -5 -5 28 0.000228
## 8 -4 -4 122 0.000994
## 9 -3 -3 655 0.00534
## 10 -2 -2 5442 0.0443
## # ℹ 11 more rows
tmpAccDFLA %>%
count(moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 12 × 3
## moddist n pct
## <dbl> <int> <dbl>
## 1 -5 95 0.000774
## 2 -4 124 0.00101
## 3 -3 655 0.00534
## 4 -2 5442 0.0443
## 5 -1 23264 0.190
## 6 0 41308 0.337
## 7 1 34833 0.284
## 8 2 10800 0.0880
## 9 3 4164 0.0339
## 10 4 1331 0.0108
## 11 5 375 0.00306
## 12 6 321 0.00262
The model more often predicts one month forward rather than one month backwards:
# Share of observations by signed (wrapped) month error, clipped to the
# range [-2, 2] so all larger errors pool into the end bins; one facet per
# actual month.
tmpAccDFLA %>%
  mutate(pct = n / sum(n), .by = month) %>%
  mutate(plotdist = pmax(pmin(moddist, 2), -2)) %>%
  summarize(pct = sum(pct), .by = c(plotdist, month)) %>%
  ggplot(aes(x = factor(plotdist), y = pct)) +
  geom_col(fill="lightblue") +
  geom_text(aes(label = round(pct, 3)), vjust = 0, size = 2.5) +
  lims(y = c(0, 1)) +
  facet_wrap(~month) +
  labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by actual month")
# Same view as above, but normalized within each PREDICTED month instead of
# each actual month; errors beyond +/-2 are pooled into the -2/+2 bins.
tmpAccDFLA %>%
group_by(pred) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, pred) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~pred) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by predicted month")
A similar approach is explored for predictions using the new city data:
# Same error-table construction as tmpAccDFLA, but for the model trained on
# the new city's own data (laxPredsMonth); moddist wraps the signed month
# difference onto the 12-month circle into [-5, 6].
tmpAccDF_v2LA <- laxPredsMonth %>%
count(month, pred) %>%
mutate(dist=as.integer(month)-as.integer(pred),
moddist=(dist+5) %% 12 - 5
)
tmpAccDF_v2LA %>%
count(dist, moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 7 × 4
## dist moddist n pct
## <int> <dbl> <int> <dbl>
## 1 -11 1 4 0.00179
## 2 -2 -2 98 0.0439
## 3 -1 -1 928 0.416
## 4 0 0 1066 0.477
## 5 1 1 62 0.0278
## 6 2 2 26 0.0116
## 7 11 -1 49 0.0219
tmpAccDF_v2LA %>%
count(moddist, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 5 × 3
## moddist n pct
## <dbl> <int> <dbl>
## 1 -2 98 0.0439
## 2 -1 977 0.438
## 3 0 1066 0.477
## 4 1 66 0.0296
## 5 2 26 0.0116
# Error distribution faceted by ACTUAL month (own-city model); errors beyond
# +/-2 pooled into the end bins
tmpAccDF_v2LA %>%
group_by(month) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, month) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~month) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by actual month")
# Same, faceted by PREDICTED month
tmpAccDF_v2LA %>%
group_by(pred) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
mutate(plotdist=ifelse(moddist<(-1), -2, ifelse(moddist>1, 2, moddist))) %>%
group_by(plotdist, pred) %>%
summarize(pct=sum(pct), .groups="drop") %>%
ggplot(aes(x=factor(plotdist), y=pct)) +
geom_col(fill="lightblue") +
geom_text(aes(label=round(pct, 3)), vjust=0, size=2.5) +
lims(y=c(0, 1)) +
facet_wrap(~pred) +
labs(x="Actual month minus predicted month", y="% Observations", title="Accuracy by predicted month")
Predictions are made for day/night-season for the third city, once using the third city’s data and once using the old city’s data:
# Using own data, predictions returned: train on pre-2022 LA rows, evaluate
# on the 2022 holdout; keep only the test-prediction table
laxPredsTODS <- runFullRF(dfTrain=laxTempTrain %>% filter(year(date) < 2022),
yVar="todSeason",
xVars=varsTrain,
dfTest=laxTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]]
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.564%
# Using previous city data: apply the old-city todSeason model to all LA rows
newPredsTODSLA <- newCityPredict(rfTODS,
dfTest=laxTemp,
trueCol="todSeason",
isContVar=FALSE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("todSeason", "pred")]
##
## Accuracy of requested data is: 64.02%
Day/night and season are predicted reasonably when applying the old city model to the new city data, with ~64% accuracy. Predictions are stronger when using new city data as inputs to the model, with ~78% accuracy
Frequency of prediction errors is explored:
# Decompose the "Season-Tod" labels back into their two components:
# the day/night part is detected via the "Night" substring, and the season
# part is the leading alphabetic run before the hyphen (^[A-Za-z]+).
tmpAccTODSLA <- newPredsTODSLA %>%
count(todSeason, pred) %>%
mutate(dnPred=ifelse(str_detect(pred, pattern="Night"), "Night", "Day"),
dnAct=ifelse(str_detect(todSeason, pattern="Night"), "Night", "Day"),
seasPred=str_extract(pred, pattern="^[A-Za-z]+"),
seasAct=str_extract(todSeason, pattern="^[A-Za-z]+")
)
# Day-night accuracy
tmpAccTODSLA %>%
count(dnAct, dnPred, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 4 × 4
## dnAct dnPred n pct
## <chr> <chr> <int> <dbl>
## 1 Day Day 55870 0.455
## 2 Day Night 5486 0.0447
## 3 Night Day 2689 0.0219
## 4 Night Night 58667 0.478
# Season accuracy
# Confusion-matrix heat map for the season component (old-city model on
# new-city data); cells show the share of all observations.
tmpAccTODSLA %>%
count(seasAct, seasPred, wt=n) %>%
mutate(pct=n/sum(n)) %>%
ggplot(aes(x=seasPred, y=seasAct)) +
geom_tile(aes(fill=100*pct)) +
geom_text(aes(label=paste0(round(100*pct, 1), "%"))) +
scale_fill_continuous("%", low="white", high="lightgreen") +
labs(x="Predicted",
y="Actual",
title="Season component predicted in todSeason",
subtitle="Old city model applied to new city data"
)
The most common source of error is inaccuracy as to season
Frequency of prediction errors is explored, using new city data as predictor:
# Same label decomposition as tmpAccTODSLA, but for the model trained on the
# new city's own data (laxPredsTODS)
tmpAccTODSNewLA <- laxPredsTODS %>%
count(todSeason, pred) %>%
mutate(dnPred=ifelse(str_detect(pred, pattern="Night"), "Night", "Day"),
dnAct=ifelse(str_detect(todSeason, pattern="Night"), "Night", "Day"),
seasPred=str_extract(pred, pattern="^[A-Za-z]+"),
seasAct=str_extract(todSeason, pattern="^[A-Za-z]+")
)
# Day-night accuracy
tmpAccTODSNewLA %>%
count(dnAct, dnPred, wt=n) %>%
mutate(pct=n/sum(n))
## # A tibble: 4 × 4
## dnAct dnPred n pct
## <chr> <chr> <int> <dbl>
## 1 Day Day 1029 0.461
## 2 Day Night 86 0.0385
## 3 Night Day 5 0.00224
## 4 Night Night 1113 0.498
# Season accuracy
# Confusion-matrix heat map for the season component, own-city model on the
# holdout year
tmpAccTODSNewLA %>%
count(seasAct, seasPred, wt=n) %>%
mutate(pct=n/sum(n)) %>%
ggplot(aes(x=seasPred, y=seasAct)) +
geom_tile(aes(fill=100*pct)) +
geom_text(aes(label=paste0(round(100*pct, 1), "%"))) +
scale_fill_continuous("%", low="white", high="lightgreen") +
labs(x="Predicted",
y="Actual",
title="Season component predicted in todSeason",
subtitle="New city data used for predicting new city holdout year"
)
Season is generally well predicted, with the exception of Winter predicted as Spring and Fall predicted as Winter. Predictions using third city data as input have higher overall accuracy
The training data for the three cities is integrated:
# Stack the three cities' training data; .id order: 1=NYC, 2=Chicago, 3=LA.
# NOTE(review): pct_ columns are dropped only from the NYC table here; they
# re-enter via bind_rows()'s column union from the Chicago/LA tables (NYC
# rows presumably NA in them — the 81-column result below suggests so).
# Confirm whether the intent was to drop pct_ columns from all three inputs.
allCityTrain <- nycTempTrain %>%
select(-starts_with("pct")) %>%
bind_rows(chiTempTrain, laxTempTrain, .id="src") %>%
mutate(src=c("1"="NYC", "2"="Chicago", "3"="LA")[src])
allCityTrain
## # A tibble: 272,520 × 81
## src time date hour temperature_2m relativehumidity_2m
## <chr> <dttm> <date> <int> <dbl> <int>
## 1 NYC 2010-01-01 00:00:00 2010-01-01 0 -1.1 95
## 2 NYC 2010-01-01 01:00:00 2010-01-01 1 -1 96
## 3 NYC 2010-01-01 02:00:00 2010-01-01 2 -1 96
## 4 NYC 2010-01-01 03:00:00 2010-01-01 3 -0.8 97
## 5 NYC 2010-01-01 04:00:00 2010-01-01 4 -0.9 97
## 6 NYC 2010-01-01 06:00:00 2010-01-01 6 -0.7 97
## 7 NYC 2010-01-01 08:00:00 2010-01-01 8 -0.6 97
## 8 NYC 2010-01-01 09:00:00 2010-01-01 9 -0.6 97
## 9 NYC 2010-01-01 10:00:00 2010-01-01 10 0.2 93
## 10 NYC 2010-01-01 11:00:00 2010-01-01 11 1 89
## # ℹ 272,510 more rows
## # ℹ 75 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
allCityTrain %>% count(year, src) %>% pivot_wider(id_cols="year", names_from="src", values_from="n")
## # A tibble: 14 × 4
## year Chicago LA NYC
## <dbl> <int> <int> <int>
## 1 2010 6491 6539 6543
## 2 2011 6571 6567 6603
## 3 2012 6517 6576 6599
## 4 2013 6533 6538 6520
## 5 2014 6503 6673 6591
## 6 2015 6613 6564 6598
## 7 2016 6664 6600 6588
## 8 2017 6544 6632 6561
## 9 2018 6510 6618 6553
## 10 2019 6711 6594 6513
## 11 2020 6602 6517 6613
## 12 2021 6586 6540 6571
## 13 2022 6583 6527 6621
## 14 2023 6606 6549 2978
# Stack the three cities' test data; same construction as allCityTrain.
# NOTE(review): as with allCityTrain, the pct_ drop only affects the NYC
# table before the column union — confirm intended.
allCityTest <- nycTempTest %>%
select(-starts_with("pct")) %>%
bind_rows(chiTempTest, laxTempTest, .id="src") %>%
mutate(src=c("1"="NYC", "2"="Chicago", "3"="LA")[src])
allCityTest
## # A tibble: 90,840 × 81
## src time date hour temperature_2m relativehumidity_2m
## <chr> <dttm> <date> <int> <dbl> <int>
## 1 NYC 2010-01-01 05:00:00 2010-01-01 5 -0.8 97
## 2 NYC 2010-01-01 07:00:00 2010-01-01 7 -0.5 97
## 3 NYC 2010-01-01 16:00:00 2010-01-01 16 5 72
## 4 NYC 2010-01-01 20:00:00 2010-01-01 20 0.3 89
## 5 NYC 2010-01-02 08:00:00 2010-01-02 8 -2.4 70
## 6 NYC 2010-01-02 15:00:00 2010-01-02 15 -2.3 48
## 7 NYC 2010-01-02 23:00:00 2010-01-02 23 -9.2 50
## 8 NYC 2010-01-03 10:00:00 2010-01-03 10 -8.8 51
## 9 NYC 2010-01-03 12:00:00 2010-01-03 12 -7.6 47
## 10 NYC 2010-01-03 20:00:00 2010-01-03 20 -5.9 52
## # ℹ 90,830 more rows
## # ℹ 75 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
allCityTest %>% count(year, src) %>% pivot_wider(id_cols="year", names_from="src", values_from="n")
## # A tibble: 14 × 4
## year Chicago LA NYC
## <dbl> <int> <int> <int>
## 1 2010 2269 2221 2217
## 2 2011 2189 2193 2157
## 3 2012 2267 2208 2185
## 4 2013 2227 2222 2240
## 5 2014 2257 2087 2169
## 6 2015 2147 2196 2162
## 7 2016 2120 2184 2196
## 8 2017 2216 2128 2199
## 9 2018 2250 2142 2207
## 10 2019 2049 2166 2247
## 11 2020 2182 2267 2171
## 12 2021 2174 2220 2189
## 13 2022 2177 2233 2139
## 14 2023 2154 2211 1006
The process is run to predict city:
# Predict the source city from raw weather variables (the pct_ prefix is
# stripped from varsTrain to use raw rather than percentile columns);
# train on pre-2022, test on 2022
runFullRF(allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022),
yVar="fct_src",
xVars=varsTrain %>% str_replace(pattern="pct_", replacement=""),
dfTest=allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022),
isContVar=FALSE,
returnData=FALSE
)
##
## Accuracy of test data is: 100%
Very high accuracy predictions are enabled by significant differences in deep soil moisture by city:
# Combine train (pre-2022) and test (2022) rows, round surface pressure to
# whole hPa, and count distinct (city, soil moisture, pressure) combinations;
# wtdf is each combination's share within its train/test panel.
p1DF <- allCityTrain %>%
filter(year<2022) %>%
bind_rows(filter(allCityTest, year==2022), .id="df") %>%
mutate(surface_pressure=round(surface_pressure),
df=c("1"="1. Train (pre-2022)", "2"="2. Test (2022)")[df]
) %>%
count(df, src, soil_moisture_100_to_255cm, surface_pressure) %>%
group_by(df) %>%
mutate(wtdf=n/sum(n)) %>%
ungroup()
# Create base plot
p1DF %>%
ggplot(aes(x=soil_moisture_100_to_255cm, y=surface_pressure)) +
geom_point(aes(size=n, color=src), alpha=0.2) +
facet_wrap(~df)
# Create density plot (note: geom_density_2d has no weight aesthetic here,
# so each distinct combination contributes equally regardless of its count)
p1DF %>%
ggplot(aes(x=soil_moisture_100_to_255cm, y=surface_pressure)) +
geom_density_2d(aes(group=src, color=src)) +
geom_point(aes(size=n, color=src), alpha=0.1) +
facet_wrap(~df)
The model is re-run using only deep soil moisture, which maintains 100% accuracy with the test data:
# Re-run the city classifier with a single predictor: deep soil moisture
runFullRF(allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022),
yVar="fct_src",
xVars=c("soil_moisture_100_to_255cm"),
dfTest=allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022),
isContVar=FALSE,
returnData=FALSE
)
##
## Accuracy of test data is: 100%
The model is re-run excluding soil moisture, which maintains ~100% accuracy with the test data:
# Re-run the city classifier EXCLUDING all soil-moisture variables (any name
# matching "moist"), again on raw columns with the pct_ prefix removed
runFullRF(allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022),
yVar="fct_src",
xVars=varsTrain[!grepl(pattern="moist", x=varsTrain)] %>% str_remove(., pattern="pct_"),
dfTest=allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022),
isContVar=FALSE,
returnData=FALSE
)
## Growing trees.. Progress: 76%. Estimated remaining time: 9 seconds.
##
## Accuracy of test data is: 99.985%
There is strong separation of cities by pressure:
# Surface pressure vs. soil temperature
# Points sized by count of rounded-value combinations; density contours per city
allCityTrain %>%
select(src, soil_temperature_100_to_255cm, surface_pressure) %>%
mutate(surface_pressure=round(surface_pressure)) %>%
count(src, soil_temperature_100_to_255cm, surface_pressure) %>%
ggplot(aes(x=surface_pressure, y=soil_temperature_100_to_255cm)) +
geom_point(aes(color=src, size=n), alpha=0.1) +
geom_density_2d(aes(color=src), linewidth=1, alpha=0.5) +
labs(title="Relationships between pressure and deep soil temperature by city") +
scale_size_continuous("# Obs") +
scale_color_discrete("City")
# Surface pressure vs. MSL pressure
# Both pressures rounded to whole hPa; per-city linear fits
allCityTrain %>%
select(src, pressure_msl, surface_pressure) %>%
mutate(across(where(is.numeric), .fns=round)) %>%
count(src, pressure_msl, surface_pressure) %>%
ggplot(aes(x=surface_pressure, y=pressure_msl)) +
geom_point(aes(color=src, size=n), alpha=0.25) +
geom_smooth(method="lm", aes(color=src)) +
labs(title="Relationships between pressure by city") +
scale_size_continuous("# Obs") +
scale_color_discrete("City")
## `geom_smooth()` using formula = 'y ~ x'
All possible combinations of 2 variables are explored on a smaller dataset:
# Train and test data
dfTrainCity <- allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022)
dfTestCity <- allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022)
# Variables to explore (raw columns plus month and tod)
possCityVars <- c(varsTrain %>% str_replace(pattern="pct_", replacement=""), "month", "tod")
# Subsets to use: a fixed 5,000-row training sample for speed
set.seed(24051114)
idxSmallCity <- sample(seq_len(nrow(dfTrainCity)), 5000, replace=FALSE)
# Fit a small RF for every unordered pair of candidate variables and record
# (idx1, idx2, accuracy). The results matrix is pre-allocated to choose(k, 2)
# rows instead of being grown with rbind() on every iteration.
nPairsCity <- choose(length(possCityVars), 2)
mtxSmallCity <- matrix(nrow=nPairsCity, ncol=3)
rowSmallCity <- 0L
for(idx1 in seq_len(length(possCityVars)-1)) {
  for(idx2 in (idx1+1):length(possCityVars)) {
    r2SmallCity <- runFullRF(dfTrain=dfTrainCity[idxSmallCity,],
                             yVar="fct_src",
                             xVars=possCityVars[c(idx1, idx2)],
                             dfTest=dfTestCity,
                             useLabel=keyLabel,
                             useSub=stringr::str_to_sentence(keyLabel),
                             isContVar=FALSE,
                             makePlots=FALSE,
                             returnData=TRUE
    )[["rfAcc"]]
    rowSmallCity <- rowSmallCity + 1L
    mtxSmallCity[rowSmallCity, ] <- c(idx1, idx2, r2SmallCity)
  }
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.03%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.107%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.534%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.671%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.51%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.977%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.023%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.382%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.198%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.985%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.511%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.152%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.923%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.283%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.68%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.297%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.167%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.457%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.579%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.45%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.343%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.305%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.642%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.011%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.354%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.392%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.74%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.847%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.015%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.19%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.594%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.931%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.312%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.366%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.557%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.679%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.03%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.084%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.068%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.991%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.144%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.709%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.574%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.421%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.22%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.426%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.58%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.824%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.939%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.496%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.297%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.817%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.94%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.255%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.178%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.162%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.335%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.648%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.954%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.297%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.434%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.006%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.274%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.266%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.198%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.755%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.336%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.556%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.518%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.403%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.052%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.457%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.451%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.519%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.373%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.892%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.075%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.755%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.679%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.763%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.496%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.81%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.253%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.431%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.477%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.628%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.976%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.022%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.847%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.648%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.702%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.732%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.419%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.886%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.824%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.306%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.603%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.496%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.565%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.992%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.099%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.703%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.619%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.877%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.556%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.778%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.282%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.916%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.739%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.663%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.535%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.118%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.668%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.781%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.825%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.511%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.068%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.472%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.9%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.786%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.556%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.702%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.992%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.884%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.777%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.419%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.785%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.359%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.352%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.755%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.579%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.915%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.709%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.885%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.755%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.504%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.932%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.26%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.323%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.598%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.421%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.252%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.419%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.869%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.884%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.457%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.449%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.991%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.739%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.953%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.907%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.197%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.472%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.831%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.243%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.663%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.693%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.647%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.289%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.029%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.602%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.61%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.426%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.74%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.64%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.571%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.411%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 90.273%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.036%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.098%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.586%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.152%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.304%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.372%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.204%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.815%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.914%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.825%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.695%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.823%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.167%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.915%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.075%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.297%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.335%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.489%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.153%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.062%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.332%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.713%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.437%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.502%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.334%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.265%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.74%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.678%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.907%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.67%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.372%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.861%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.151%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.993%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.711%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.136%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.571%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.129%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.389%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.458%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.619%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.23%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.133%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.713%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.674%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.723%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.479%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.694%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.96%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.916%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.037%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.082%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.616%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.54%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.403%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.153%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.413%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.541%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.342%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.709%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.022%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.344%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.924%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.948%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.522%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.339%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.369%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.158%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.264%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.953%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.541%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.762%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.526%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.755%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.785%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.556%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.886%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.336%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.015%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.388%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.725%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.473%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.069%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.176%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.123%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.087%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.759%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.597%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.266%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.945%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.969%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.029%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.105%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.67%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.808%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.22%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.794%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.397%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.09%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.396%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.9%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.87%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.801%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.916%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.215%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.446%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.163%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.033%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.891%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.296%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.549%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.251%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.389%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.236%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.045%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.566%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.076%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.335%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.145%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.168%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.427%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.428%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.268%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.039%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.856%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.354%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.606%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.59%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.168%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.564%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.862%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.029%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.991%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.909%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.268%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.518%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.48%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.633%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.159%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.297%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.847%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.932%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.695%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.191%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.034%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.896%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.551%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.686%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.716%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.09%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.013%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.868%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.299%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.504%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.22%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.594%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.045%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.35%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.121%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.892%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.595%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.42%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.214%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.598%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.14%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.047%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.563%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.998%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.051%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.334%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.512%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.596%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.205%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.854%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.847%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.518%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.648%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.602%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.503%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.588%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.94%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.507%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.14%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.841%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.708%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.578%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.952%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.955%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.367%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.869%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.847%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.686%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.213%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.412%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.427%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.732%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.496%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.054%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.675%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.018%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.1%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.479%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.128%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.245%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.504%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.144%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.946%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.526%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.305%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.114%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.144%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.542%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.763%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.856%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.262%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.621%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.33%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.769%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.464%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.589%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.047%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.581%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.362%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.864%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.749%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.443%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.238%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.726%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.414%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.446%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.355%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.369%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.207%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.344%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.031%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.779%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.528%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.367%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.527%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.741%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.596%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.383%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.536%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.683%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.286%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.018%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.161%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.481%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.152%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.992%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.144%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.06%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.198%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.145%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.321%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.222%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.774%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.209%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.643%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.335%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.518%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.007%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.984%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.93%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.198%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.091%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.458%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.573%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.552%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.155%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.39%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.739%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.144%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.603%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.038%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.785%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.938%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.244%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.932%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.003%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.896%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 57.871%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.617%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.289%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.916%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.969%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.603%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.763%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.955%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.774%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.171%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.894%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.709%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 39.762%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.435%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.259%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.985%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.1%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.438%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.468%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.712%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.694%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.205%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.526%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.182%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.436%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.858%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.668%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.353%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.542%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.839%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.443%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.635%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.546%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.142%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.912%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.199%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.939%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.681%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.507%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.897%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.492%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 63.384%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.565%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.286%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.14%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.783%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.66%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.993%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 76.271%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.751%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.851%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.736%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.407%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.095%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.377%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.95%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.285%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.035%
Predictive success by metric is explored:
# Convert the small-model accuracy matrix into a labeled tibble: each row
# pairs two predictor indices (idx1, idx2) with the resulting holdout
# accuracy (stored as r2), plus the corresponding variable names.
dfSmallR2City <- mtxSmallCity %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possCityVars[idx1],
    var2 = possCityVars[idx2],
    rn = row_number()
  )
# Display the 20 best-performing predictor pairs
dfSmallR2City %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 561 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 temperature_2m soil_moisture_100_to_255cm 1
## 2 relativehumidity_2m soil_moisture_100_to_255cm 1
## 3 dewpoint_2m soil_moisture_100_to_255cm 1
## 4 apparent_temperature soil_moisture_100_to_255cm 1
## 5 pressure_msl surface_pressure 1
## 6 pressure_msl soil_moisture_100_to_255cm 1
## 7 surface_pressure soil_moisture_100_to_255cm 1
## 8 precipitation soil_moisture_100_to_255cm 1
## 9 rain soil_moisture_100_to_255cm 1
## 10 snowfall soil_moisture_100_to_255cm 1
## 11 cloudcover soil_moisture_100_to_255cm 1
## 12 cloudcover_low soil_moisture_100_to_255cm 1
## 13 cloudcover_mid soil_moisture_100_to_255cm 1
## 14 cloudcover_high soil_moisture_100_to_255cm 1
## 15 shortwave_radiation soil_moisture_100_to_255cm 1
## 16 direct_radiation soil_moisture_100_to_255cm 1
## 17 direct_normal_irradiance soil_moisture_100_to_255cm 1
## 18 diffuse_radiation soil_moisture_100_to_255cm 1
## 19 windspeed_10m soil_moisture_100_to_255cm 1
## 20 windspeed_100m soil_moisture_100_to_255cm 1
## # ℹ 541 more rows
# For every variable, gather the accuracy of each 2-predictor model it
# appeared in (either predictor slot) and chart the min / mean / max range,
# ordered by mean accuracy. A dashed red line marks perfect accuracy.
dfSmallR2City %>%
  pivot_longer(cols = c(var1, var2)) %>%
  group_by(value) %>%
  summarize(
    r2_min = min(r2),
    r2_mu = mean(r2),
    r2_max = max(r2)
  ) %>%
  ggplot(aes(x = fct_reorder(value, r2_mu))) +
  coord_flip() +
  geom_point(aes(y = r2_mu)) +
  geom_errorbar(aes(ymin = r2_min, ymax = r2_max)) +
  lims(y = c(NA, 1)) +
  geom_hline(yintercept = 1, lty = 2, color = "red") +
  labs(title="Accuracy in every 2-predictor model including self and one other",
       subtitle="Predicting city",
       y="Range of accuracy (min-mean-max)",
       x=NULL
  )
# Re-rank the predictor pairs after dropping the deep-soil-moisture variable
# from both slots, so the remaining signal can be compared on its own.
dfSmallR2City %>%
  arrange(desc(r2)) %>%
  filter(
    var1 != "soil_moisture_100_to_255cm",
    var2 != "soil_moisture_100_to_255cm"
  ) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 528 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pressure_msl surface_pressure 1
## 2 surface_pressure soil_moisture_0_to_7cm 0.937
## 3 surface_pressure soil_temperature_28_to_100cm 0.927
## 4 surface_pressure soil_moisture_7_to_28cm 0.926
## 5 surface_pressure soil_temperature_7_to_28cm 0.926
## 6 surface_pressure soil_temperature_100_to_255cm 0.924
## 7 surface_pressure cloudcover_mid 0.917
## 8 apparent_temperature surface_pressure 0.916
## 9 surface_pressure soil_temperature_0_to_7cm 0.916
## 10 surface_pressure soil_moisture_28_to_100cm 0.916
## 11 temperature_2m surface_pressure 0.915
## 12 surface_pressure cloudcover 0.914
## 13 dewpoint_2m surface_pressure 0.914
## 14 surface_pressure cloudcover_low 0.910
## 15 surface_pressure rain 0.909
## 16 surface_pressure precipitation 0.909
## 17 surface_pressure windspeed_10m 0.907
## 18 relativehumidity_2m surface_pressure 0.906
## 19 surface_pressure direct_normal_irradiance 0.905
## 20 surface_pressure snowfall 0.905
## # ℹ 508 more rows
As seen previously, deep soil moisture perfectly differentiates the three cities. Excluding deep soil moisture, the combination of surface pressure and MSL pressure (highly related to the city's altitude above sea level) also perfectly predicts the city.
Select combinations are explored using the full training dataset:
# Keep only pairs (excluding deep soil moisture) whose holdout accuracy is
# at least 92.5%, then collect the distinct variables involved. Step order
# matters here: unique() preserves first appearance within the
# accuracy-sorted pairs, so arrange() must precede the pivot.
possLargeVars <- dfSmallR2City %>%
  filter(
    var1 != "soil_moisture_100_to_255cm",
    var2 != "soil_moisture_100_to_255cm"
  ) %>%
  arrange(desc(r2)) %>%
  filter(r2 >= 0.925) %>%
  select(r2, rn, var1, var2) %>%
  pivot_longer(cols = c(var1, var2)) %>%
  pull(value) %>%
  unique()
# Auto-print the retained candidate variables
possLargeVars
## [1] "pressure_msl" "surface_pressure"
## [3] "soil_moisture_0_to_7cm" "soil_temperature_28_to_100cm"
## [5] "soil_moisture_7_to_28cm" "soil_temperature_7_to_28cm"
# Evaluate a full random-forest model for every unordered pair of candidate
# variables, collecting rows of (idx1, idx2, holdout accuracy) into mtxLarge.
#
# Pairs are enumerated with utils::combn() and the rows are bound once at the
# end via do.call(rbind, ...), which avoids the O(n^2) copying cost of growing
# a matrix with rbind() inside a loop. It also sidesteps the 1:(n-1) indexing
# bug in the original nested-loop form, which would iterate over c(1, 0) when
# fewer than two candidate variables remain.
if (length(possLargeVars) >= 2) {
  pairIdx <- utils::combn(seq_along(possLargeVars), 2)
  mtxLarge <- do.call(rbind, lapply(seq_len(ncol(pairIdx)), function(k) {
    idx1 <- pairIdx[1, k]
    idx2 <- pairIdx[2, k]
    # Train on pre-2022 data and score the 2022 holdout; rfAcc is the
    # classification accuracy reported by runFullRF().
    r2LargeCity <- runFullRF(dfTrain=dfTrainCity[,],
                             yVar="fct_src",
                             xVars=possLargeVars[c(idx1, idx2)],
                             dfTest=dfTestCity,
                             useLabel=keyLabel,
                             useSub=stringr::str_to_sentence(keyLabel),
                             isContVar=FALSE,
                             makePlots=FALSE,
                             returnData=TRUE
    )[["rfAcc"]]
    c(idx1, idx2, r2LargeCity)
  }))
} else {
  # No pairs to evaluate: keep the same empty 3-column shape as before
  mtxLarge <- matrix(nrow=0, ncol=3)
}
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.224%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.55%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.743%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.352%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.923%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.419%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.625%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.129%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.469%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.714%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.874%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.012%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.611%
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.401%
# Label the large-model accuracy matrix with the variable names each index
# pair refers to, then display every pair from best to worst accuracy.
dfLargeR2City <- mtxLarge %>%
  as.data.frame() %>%
  purrr::set_names(c("idx1", "idx2", "r2")) %>%
  tibble::as_tibble() %>%
  mutate(
    var1 = possLargeVars[idx1],
    var2 = possLargeVars[idx2],
    rn = row_number()
  )
dfLargeR2City %>%
  arrange(desc(r2)) %>%
  select(var1, var2, r2) %>%
  print(n = 20)
## # A tibble: 15 × 3
## var1 var2 r2
## <chr> <chr> <dbl>
## 1 pressure_msl surface_pressure 1
## 2 surface_pressure soil_moisture_0_to_7cm 0.939
## 3 surface_pressure soil_temperature_28_to_100cm 0.934
## 4 surface_pressure soil_temperature_7_to_28cm 0.931
## 5 surface_pressure soil_moisture_7_to_28cm 0.926
## 6 soil_moisture_7_to_28cm soil_temperature_7_to_28cm 0.794
## 7 soil_temperature_28_to_100cm soil_moisture_7_to_28cm 0.780
## 8 soil_moisture_0_to_7cm soil_temperature_7_to_28cm 0.779
## 9 soil_moisture_0_to_7cm soil_moisture_7_to_28cm 0.747
## 10 soil_moisture_0_to_7cm soil_temperature_28_to_100cm 0.745
## 11 pressure_msl soil_moisture_0_to_7cm 0.702
## 12 pressure_msl soil_moisture_7_to_28cm 0.687
## 13 pressure_msl soil_temperature_28_to_100cm 0.556
## 14 pressure_msl soil_temperature_7_to_28cm 0.554
## 15 soil_temperature_28_to_100cm soil_temperature_7_to_28cm 0.536
Daily and hourly data are downloaded for Houston, cached to avoid multiple hits to the server:
# Hourly data download for Houston, TX
# Build the Open-Meteo archive URL requesting every hourly metric in
# tblMetricsHourly for 2010-2023, localized to US/Central time.
# seq_len() replaces 1:nrow(), which misbehaves (yields c(1, 0)) on 0 rows;
# results are identical here since tblMetricsHourly is non-empty.
testURLHourly <- helperOpenMeteoURL(cityName="Houston TX",
                                    hourlyIndices=seq_len(nrow(tblMetricsHourly)),
                                    startDate="2010-01-01",
                                    endDate="2023-12-31",
                                    tz="US/Central"
                                    )
##
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=29.77&longitude=-95.39&start_date=2010-01-01&end_date=2023-12-31&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=US%2FCentral"
# Download file only when the cached copy is not already on disk
if (file.exists("testOM_hourly_hou.json")) {
  cat("\nFile testOM_hourly_hou.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_hourly_hou.json", url=testURLHourly)
}
##
## File testOM_hourly_hou.json already exists, skipping download
# Daily data download for Houston, TX
# Build the Open-Meteo archive URL requesting every daily metric in
# tblMetricsDaily for 2010-2023, localized to US/Central time.
# seq_len() replaces 1:nrow(), which misbehaves (yields c(1, 0)) on 0 rows;
# results are identical here since tblMetricsDaily is non-empty.
testURLDaily <- helperOpenMeteoURL(cityName="Houston TX",
                                   dailyIndices=seq_len(nrow(tblMetricsDaily)),
                                   startDate="2010-01-01",
                                   endDate="2023-12-31",
                                   tz="US/Central"
                                   )
##
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=29.77&longitude=-95.39&start_date=2010-01-01&end_date=2023-12-31&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=US%2FCentral"
# Download file only when the cached copy is not already on disk
if (file.exists("testOM_daily_hou.json")) {
  cat("\nFile testOM_daily_hou.json already exists, skipping download\n")
} else {
  fileDownload(fileName="testOM_daily_hou.json", url=testURLDaily)
}
##
## File testOM_daily_hou.json already exists, skipping download
Core datasets for Houston are loaded, with explanatory variables added for future processing:
# Read daily JSON file
# Parses the cached Open-Meteo response into a list of tibbles
# (tblDaily, tblHourly, tblUnits, tblDescription); tblHourly is NULL here
# because this file contains only daily metrics.
houOMDaily <- readOpenMeteoJSON("testOM_daily_hou.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily
houOMDaily
## $tblDaily
## # A tibble: 5,113 × 18
## date time weathercode temperature_2m_max temperature_2m_min
## <date> <chr> <int> <dbl> <dbl>
## 1 2010-01-01 2010-01-01 3 11.8 3.9
## 2 2010-01-02 2010-01-02 1 12 0.7
## 3 2010-01-03 2010-01-03 3 10 4.4
## 4 2010-01-04 2010-01-04 3 7.6 1.8
## 5 2010-01-05 2010-01-05 0 8 -1.9
## 6 2010-01-06 2010-01-06 51 12.7 -0.1
## 7 2010-01-07 2010-01-07 55 13.4 -0.2
## 8 2010-01-08 2010-01-08 2 0.8 -3
## 9 2010-01-09 2010-01-09 0 4.4 -5.5
## 10 2010-01-10 2010-01-10 0 5.9 -4.6
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## # apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## # snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## # windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## # winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## # et0_fao_evapotranspiration <dbl>
##
## $tblHourly
## NULL
##
## $tblUnits
## # A tibble: 17 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 daily_units time "iso8601" <NA>
## 2 daily_units weathercode "wmo code" The most severe weather co…
## 3 daily_units temperature_2m_max "deg C" Maximum and minimum daily …
## 4 daily_units temperature_2m_min "deg C" Maximum and minimum daily …
## 5 daily_units apparent_temperature_max "deg C" Maximum and minimum daily …
## 6 daily_units apparent_temperature_min "deg C" Maximum and minimum daily …
## 7 daily_units precipitation_sum "mm" Sum of daily precipitation…
## 8 daily_units rain_sum "mm" Sum of daily rain
## 9 daily_units snowfall_sum "cm" Sum of daily snowfall
## 10 daily_units precipitation_hours "h" The number of hours with r…
## 11 daily_units sunrise "iso8601" Sun rise and set times
## 12 daily_units sunset "iso8601" Sun rise and set times
## 13 daily_units windspeed_10m_max "km/h" Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max "km/h" Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg " Dominant wind direction
## 16 daily_units shortwave_radiation_sum "MJ/m²" The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm" Daily sum of ET0 Reference…
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 29.8 -95.4 64.0 -18000 US/Central
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(houOMDaily)
##
## latitude: 29.77153
## longitude: -95.43555
## generationtime_ms: 63.96198
## utc_offset_seconds: -18000
## timezone: US/Central
## timezone_abbreviation: CDT
## elevation: 17
# Read hourly JSON file
# Parses the cached Open-Meteo response into a list of tibbles
# (tblDaily, tblHourly, tblUnits, tblDescription); tblDaily is NULL here
# because this file contains only hourly metrics.
houOMHourly <- readOpenMeteoJSON("testOM_hourly_hou.json")
##
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly
houOMHourly
## $tblDaily
## NULL
##
## $tblHourly
## # A tibble: 122,712 × 37
## time date hour temperature_2m relativehumidity_2m
## <dttm> <date> <int> <dbl> <int>
## 1 2010-01-01 00:00:00 2010-01-01 0 10.9 93
## 2 2010-01-01 01:00:00 2010-01-01 1 9.9 92
## 3 2010-01-01 02:00:00 2010-01-01 2 8.6 88
## 4 2010-01-01 03:00:00 2010-01-01 3 7.7 86
## 5 2010-01-01 04:00:00 2010-01-01 4 7.2 85
## 6 2010-01-01 05:00:00 2010-01-01 5 6.8 84
## 7 2010-01-01 06:00:00 2010-01-01 6 6.4 82
## 8 2010-01-01 07:00:00 2010-01-01 7 5.9 83
## 9 2010-01-01 08:00:00 2010-01-01 8 5.6 83
## 10 2010-01-01 09:00:00 2010-01-01 9 5.5 82
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## # pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## # rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## # cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## # direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## # diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
##
## $tblUnits
## # A tibble: 34 × 4
## metricType name value description
## <chr> <chr> <chr> <chr>
## 1 hourly_units time iso8601 <NA>
## 2 hourly_units temperature_2m deg C Air temperature at 2 meters above …
## 3 hourly_units relativehumidity_2m % Relative humidity at 2 meters abov…
## 4 hourly_units dewpoint_2m deg C Dew point temperature at 2 meters …
## 5 hourly_units apparent_temperature deg C Apparent temperature is the percei…
## 6 hourly_units pressure_msl hPa Atmospheric air pressure reduced t…
## 7 hourly_units surface_pressure hPa Atmospheric air pressure reduced t…
## 8 hourly_units precipitation mm Total precipitation (rain, showers…
## 9 hourly_units rain mm Only liquid precipitation of the p…
## 10 hourly_units snowfall cm Snowfall amount of the preceding h…
## # ℹ 24 more rows
##
## $tblDescription
## # A tibble: 1 × 7
## latitude longitude generationtime_ms utc_offset_seconds timezone
## <dbl> <dbl> <dbl> <int> <chr>
## 1 29.8 -95.4 3762. -18000 US/Central
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
prettyOpenMeteoMeta(houOMHourly)
##
## latitude: 29.77153
## longitude: -95.43555
## generationtime_ms: 3762.283
## utc_offset_seconds: -18000
## timezone: US/Central
## timezone_abbreviation: CDT
## elevation: 17
# Create percentiles for numeric variables
# Adds calendar/time-of-day explanatory columns to Houston's hourly data,
# then appends a 0-100 percentile-rank column ("pct_" prefix) for every
# numeric column -- including the just-created year/hour/doy, which is why
# pct_year, pct_hour, and pct_doy appear in the result.
houTemp <- houOMHourly$tblHourly %>%
mutate(year=year(date),
month=factor(month.abb[lubridate::month(date)], levels=month.abb),
# hour is recomputed from the timestamp; 7:00-18:00 inclusive counts as "Day"
hour=lubridate::hour(time),
fct_hour=factor(hour),
tod=ifelse(hour>=7 & hour<=18, "Day", "Night"),
doy=yday(date),
# "typo" is a sentinel that should never appear -- it would flag a month typo
season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Fall",
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
TRUE~"typo"
),
# todSeason is built from the CHARACTER versions of season/tod before those
# two are converted to factors on the following lines -- order matters here
todSeason=paste0(season, "-", tod),
tod=factor(tod, levels=c("Day", "Night")),
season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")),
todSeason=factor(todSeason,
levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2),
"-",
c("Day", "Night")
)
),
# percent_rank() returns values in [0, 1]; scaled to 0-100 and rounded
across(where(is.numeric), .fns=function(x) round(100*percent_rank(x)), .names="pct_{.col}")
)
glimpse(houTemp)
## Rows: 122,712
## Columns: 80
## $ time <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m <dbl> 10.9, 9.9, 8.6, 7.7, 7.2, 6.8, 6.4, …
## $ relativehumidity_2m <int> 93, 92, 88, 86, 85, 84, 82, 83, 83, …
## $ dewpoint_2m <dbl> 9.8, 8.6, 6.7, 5.6, 4.8, 4.2, 3.6, 3…
## $ apparent_temperature <dbl> 7.4, 5.7, 4.1, 3.2, 2.9, 2.4, 2.2, 1…
## $ pressure_msl <dbl> 1025.2, 1025.9, 1026.8, 1027.1, 1027…
## $ surface_pressure <dbl> 1023.1, 1023.8, 1024.7, 1025.0, 1025…
## $ precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover <int> 90, 90, 88, 88, 89, 89, 86, 80, 90, …
## $ cloudcover_low <int> 100, 100, 98, 98, 99, 99, 96, 89, 10…
## $ cloudcover_mid <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover_high <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 89, 1…
## $ direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 28, 58…
## $ direct_normal_irradiance <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 61, 1…
## $ windspeed_10m <dbl> 24.0, 25.9, 25.3, 23.5, 20.9, 20.7, …
## $ windspeed_100m <dbl> 37.4, 39.1, 38.4, 35.4, 32.0, 31.2, …
## $ winddirection_10m <int> 330, 333, 336, 339, 341, 340, 347, 3…
## $ winddirection_100m <int> 332, 334, 337, 341, 343, 341, 347, 3…
## $ windgusts_10m <dbl> 44.3, 46.1, 46.8, 44.3, 41.0, 37.8, …
## $ et0_fao_evapotranspiration <dbl> 0.00, 0.01, 0.01, 0.01, 0.02, 0.02, …
## $ weathercode <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, …
## $ vapor_pressure_deficit <dbl> 0.10, 0.10, 0.14, 0.14, 0.16, 0.16, …
## $ soil_temperature_0_to_7cm <dbl> 11.9, 11.5, 11.0, 10.5, 10.1, 9.8, 9…
## $ soil_temperature_7_to_28cm <dbl> 12.3, 12.3, 12.2, 12.2, 12.1, 12.0, …
## $ soil_temperature_28_to_100cm <dbl> 14.2, 14.2, 14.2, 14.2, 14.2, 14.2, …
## $ soil_temperature_100_to_255cm <dbl> 20.9, 20.9, 20.9, 20.9, 20.9, 20.9, …
## $ soil_moisture_0_to_7cm <dbl> 0.462, 0.462, 0.462, 0.462, 0.462, 0…
## $ soil_moisture_7_to_28cm <dbl> 0.474, 0.474, 0.474, 0.474, 0.473, 0…
## $ soil_moisture_28_to_100cm <dbl> 0.498, 0.498, 0.498, 0.498, 0.498, 0…
## $ soil_moisture_100_to_255cm <dbl> 0.453, 0.453, 0.453, 0.453, 0.453, 0…
## $ origTime <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod <fct> Night, Night, Night, Night, Night, N…
## $ doy <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m <dbl> 12, 10, 8, 6, 6, 5, 5, 4, 4, 4, 4, 5…
## $ pct_relativehumidity_2m <dbl> 80, 77, 67, 63, 61, 59, 55, 57, 57, …
## $ pct_dewpoint_2m <dbl> 23, 21, 17, 15, 13, 12, 11, 10, 9, 9…
## $ pct_apparent_temperature <dbl> 11, 9, 6, 5, 5, 4, 4, 4, 4, 3, 3, 4,…
## $ pct_pressure_msl <dbl> 92, 93, 94, 95, 96, 97, 97, 97, 97, …
## $ pct_surface_pressure <dbl> 92, 93, 94, 95, 96, 97, 97, 97, 98, …
## $ pct_precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover <dbl> 80, 80, 79, 79, 79, 79, 78, 76, 80, …
## $ pct_cloudcover_low <dbl> 89, 89, 87, 87, 88, 88, 86, 84, 89, …
## $ pct_cloudcover_mid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover_high <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_shortwave_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 59, 6…
## $ pct_direct_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 61, 6…
## $ pct_direct_normal_irradiance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 63, 6…
## $ pct_diffuse_radiation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 60, 7…
## $ pct_windspeed_10m <dbl> 95, 97, 96, 94, 90, 89, 83, 79, 78, …
## $ pct_windspeed_100m <dbl> 96, 97, 97, 95, 90, 89, 82, 78, 76, …
## $ pct_winddirection_10m <dbl> 91, 92, 92, 93, 93, 93, 95, 98, 96, …
## $ pct_winddirection_100m <dbl> 92, 92, 93, 94, 94, 94, 96, 99, 97, …
## $ pct_windgusts_10m <dbl> 94, 96, 96, 94, 91, 87, 87, 84, 77, …
## $ pct_et0_fao_evapotranspiration <dbl> 0, 24, 24, 24, 32, 32, 32, 24, 24, 3…
## $ pct_weathercode <dbl> 69, 69, 69, 69, 69, 69, 69, 69, 69, …
## $ pct_vapor_pressure_deficit <dbl> 10, 10, 16, 16, 19, 19, 20, 19, 19, …
## $ pct_soil_temperature_0_to_7cm <dbl> 10, 9, 8, 7, 6, 6, 5, 4, 4, 4, 4, 5,…
## $ pct_soil_temperature_7_to_28cm <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4, …
## $ pct_soil_temperature_28_to_100cm <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ pct_soil_temperature_100_to_255cm <dbl> 38, 38, 38, 38, 38, 38, 38, 38, 38, …
## $ pct_soil_moisture_0_to_7cm <dbl> 82, 82, 82, 82, 82, 82, 82, 82, 82, …
## $ pct_soil_moisture_7_to_28cm <dbl> 88, 88, 88, 88, 88, 88, 88, 88, 88, …
## $ pct_soil_moisture_28_to_100cm <dbl> 98, 98, 98, 98, 98, 98, 98, 98, 98, …
## $ pct_soil_moisture_100_to_255cm <dbl> 82, 82, 82, 82, 82, 82, 82, 82, 82, …
## $ pct_year <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: boxplot of day-of-year values observed within each month
houTemp %>%
count(doy, month) %>%
ggplot(aes(y=doy, x=month)) +
geom_boxplot(aes(weight=n), fill="lightblue") +
labs(title="Observations by day-of-year and month", x=NULL, y="Day of Year")
# Sanity check: heatmap of record counts by year x month, labeled with counts
houTemp %>%
count(year, month) %>%
ggplot(aes(y=factor(year), x=month)) +
geom_tile(aes(fill=n)) +
geom_text(aes(label=n), size=3) +
scale_fill_continuous("# Records", low="white", high="green") +
labs(title="Records by year and month", x=NULL, y=NULL)
houTemp %>% count(todSeason, season, tod)
## # A tibble: 8 × 4
## todSeason season tod n
## <fct> <fct> <fct> <int>
## 1 Spring-Day Spring Day 15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day Summer Day 15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day Fall Day 15288
## 6 Fall-Night Fall Night 15288
## 7 Winter-Day Winter Day 15156
## 8 Winter-Night Winter Night 15156
houTemp %>% count(hour, fct_hour, tod) %>% print(n=30)
## # A tibble: 24 × 4
## hour fct_hour tod n
## <int> <fct> <fct> <int>
## 1 0 0 Night 5113
## 2 1 1 Night 5113
## 3 2 2 Night 5113
## 4 3 3 Night 5113
## 5 4 4 Night 5113
## 6 5 5 Night 5113
## 7 6 6 Night 5113
## 8 7 7 Day 5113
## 9 8 8 Day 5113
## 10 9 9 Day 5113
## 11 10 10 Day 5113
## 12 11 11 Day 5113
## 13 12 12 Day 5113
## 14 13 13 Day 5113
## 15 14 14 Day 5113
## 16 15 15 Day 5113
## 17 16 16 Day 5113
## 18 17 17 Day 5113
## 19 18 18 Day 5113
## 20 19 19 Night 5113
## 21 20 20 Night 5113
## 22 21 21 Night 5113
## 23 22 22 Night 5113
## 24 23 23 Night 5113
houTemp %>% count(month, season)
## # A tibble: 12 × 3
## month season n
## <fct> <fct> <int>
## 1 Jan Winter 10416
## 2 Feb Winter 9480
## 3 Mar Spring 10416
## 4 Apr Spring 10080
## 5 May Spring 10416
## 6 Jun Summer 10080
## 7 Jul Summer 10416
## 8 Aug Summer 10416
## 9 Sep Fall 10080
## 10 Oct Fall 10416
## 11 Nov Fall 10080
## 12 Dec Winter 10416
Distributions of several key variables are explored:
# Key hourly metrics selected for the cross-city distribution comparison
keyVars <- c(
  "temperature_2m",
  "relativehumidity_2m",
  "dewpoint_2m",
  "shortwave_radiation",
  "vapor_pressure_deficit",
  "soil_temperature_28_to_100cm",
  "soil_temperature_100_to_255cm",
  "soil_moisture_28_to_100cm",
  "soil_moisture_100_to_255cm"
)
# Compare distributions of the key metrics across the four cities.
# bind_rows' .id yields src = "1".."4" in bind order (LA, Chicago, NYC,
# Houston); the named lookup vector converts those codes to city labels.
laxTemp %>%
colSelector(vecSelect=keyVars) %>%
bind_rows(colSelector(chiTemp, vecSelect=keyVars),
colSelector(nycTemp, vecSelect=keyVars),
colSelector(houTemp, vecSelect=keyVars),
.id="src"
) %>%
mutate(cty=c("1"="LA", "2"="Chicago", "3"="NYC", "4"="Houston")[src]) %>%
pivot_longer(cols=-c(src, cty)) %>%
ggplot(aes(x=cty, y=value)) +
geom_boxplot(aes(fill=cty)) +
facet_wrap(~name, scales="free_y") +
labs(x=NULL, y=NULL, title="Distribution of Key Metrics by City") +
scale_fill_discrete(NULL)
The scatter of temperature and dewpoint is also explored:
# NOTE: the bind order below is lax, nyc, chi, hou, so cVec maps "2" to NYC
# and "3" to Chicago -- a different ordering than the earlier boxplot chunk
cVec <- c("1"="LA", "2"="NYC", "3"="Chicago", "4"="Houston")
# Temperature/dewpoint scatter on rounded values, with per-city weighted
# linear fits (weight = count of hourly observations at each rounded pair)
laxTemp %>%
bind_rows(nycTemp, chiTemp, houTemp, .id="src") %>%
select(t=temperature_2m, d=dewpoint_2m, src) %>%
mutate(across(.cols=where(is.numeric), .fns=function(x) round(x))) %>%
count(src, t, d) %>%
ggplot(aes(x=t, y=d)) +
geom_point(aes(size=n, color=cVec[src]), alpha=0.5) +
geom_smooth(aes(color=cVec[src], weight=n), method="lm") +
labs(x="Temperature (C)", y="Dewpoint (C)", title="Temperature vs. Dewpoint", subtitle="Hourly") +
scale_color_discrete(NULL) +
scale_size_continuous("# Obs")
## `geom_smooth()` using formula = 'y ~ x'
# Per-city correlation between hourly temperature and dewpoint
# (group_by sorts groups alphabetically, so output rows are Chicago,
# Houston, LA, NYC)
laxTemp %>%
bind_rows(nycTemp, chiTemp, houTemp, .id="src") %>%
mutate(src=cVec[src]) %>%
group_by(src) %>%
summarize(cor_td=cor(temperature_2m, dewpoint_2m))
## # A tibble: 4 × 2
## src cor_td
## <chr> <dbl>
## 1 Chicago 0.950
## 2 Houston 0.834
## 3 LA 0.273
## 4 NYC 0.919
Houston is similar to NYC and Chicago in having a largely linear relationship between temperature and dewpoint.
The previous models for predicting city (one with soil temperature, one without) are saved, for application to the new Houston data:
# Run with all variables
# Re-train the three-city classifier on pre-2022 data using the RAW metric
# values (str_replace strips the "pct_" prefix from the percentile column
# names), holding out 2022 as the test set.
rfCityFull <- runFullRF(allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022),
yVar="fct_src",
xVars=varsTrain %>% str_replace(pattern="pct_", replacement=""),
dfTest=allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022),
isContVar=FALSE,
returnData=TRUE
)
##
## Accuracy of test data is: 100%
predictRF(rfCityFull$rf, df=houTemp) %>% count(pred)
## # A tibble: 1 × 2
## pred n
## <fct> <int>
## 1 NYC 122712
# Run without moisture variables
# Same classifier, but soil-moisture columns are excluded (grepl on "moist")
# before the "pct_" prefix is stripped to obtain raw-value column names.
rfCityNoMoisture <- runFullRF(allCityTrain %>% mutate(fct_src=factor(src)) %>% filter(year<2022),
yVar="fct_src",
xVars=varsTrain[!grepl(pattern="moist", x=varsTrain)] %>%
str_remove(., pattern="pct_"),
dfTest=allCityTest %>% mutate(fct_src=factor(src)) %>% filter(year==2022),
isContVar=FALSE,
returnData=TRUE
)
## Growing trees.. Progress: 99%. Estimated remaining time: 0 seconds.
##
## Accuracy of test data is: 99.969%
predictRF(rfCityNoMoisture$rf, df=houTemp) %>% count(pred)
## # A tibble: 1 × 2
## pred n
## <fct> <int>
## 1 NYC 122712
The previously trained random forest models overwhelmingly predict Houston as NYC rather than Chicago or LA.
The linear approximation for estimating temperature based on dewpoint and relative humidity is applied:
# Apply the previously fit linear model (relative humidity + dewpoint ->
# temperature) to Houston and summarize error within 5-degree actual-temp
# bins.
# NOTE on the pipe: because `.` appears as a named argument (pred=.),
# magrittr does NOT also insert the piped predictions as mutate()'s first
# argument. The data argument is select(houTemp, temperature_2m) and the
# predicted vector becomes the pred column.
ggMiniTempHOU <- predict(lmMiniTemp, newdata=houTemp %>% select(rh=relativehumidity_2m, d=dewpoint_2m)) %>%
mutate(select(houTemp, temperature_2m),
pred=.,
err=pred-temperature_2m,
err2=err**2,
rnd5=round(temperature_2m/5)*5
) %>%
group_by(rnd5) %>%
summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTempHOU
## # A tibble: 11 × 6
## rnd5 n temperature_2m pred err err2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -10 28 -8.87 -8.46 0.415 0.305
## 2 -5 194 -3.97 -3.72 0.254 0.290
## 3 0 1416 0.788 0.826 0.0378 0.287
## 4 5 5956 5.36 5.39 0.0294 0.268
## 5 10 12122 10.2 10.2 0.0158 0.327
## 6 15 16238 15.2 15.1 -0.120 0.519
## 7 20 25720 20.2 20.0 -0.136 0.540
## 8 25 36145 25.0 25.1 0.0655 0.553
## 9 30 19555 29.6 29.8 0.205 1.13
## 10 35 5047 34.2 33.0 -1.26 3.83
## 11 40 291 38.4 34.2 -4.14 19.5
# Plot mean actual vs. mean predicted temperature per 5-degree bin; the
# dashed 45-degree line marks perfect agreement
ggMiniTempHOU %>%
select(rnd5, temperature_2m, pred) %>%
pivot_longer(cols=-c(rnd5)) %>%
ggplot(aes(x=rnd5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
)
) +
labs(title="Actual vs. Predicted Temperature Using Old City Linear Model on New City Data",
x="New city actual temperature (rounded to nearest 5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
The linear approximation based on dewpoint and relative humidity in NYC is generally accurate for predicting temperatures in Houston, consistent with Houston having T/D trends similar to NYC and Chicago. Houston gets a bit hotter than these cities, which drives temperature under-prediction on the very hottest Houston days.
The existing random forest model is applied to make predictions for Houston:
# Apply the existing random-forest temperature model to Houston; keep only
# the actual and predicted temperature columns from the returned test-set
# predictions (predictions rounded to the nearest 0.5 via rndTo)
newPredsActualHOU <- newCityPredict(rfTemp2mBest3Actual,
dfTest=houTemp,
trueCol="temperature_2m",
isContVar=TRUE,
rndTo=0.5,
refXY=TRUE
)[["tstPred"]][,c("temperature_2m", "pred")]
##
## R-squared of requested data is: 99.62% (RMSE 0.48 vs. 7.77 null)
## `geom_smooth()` using formula = 'y ~ x'
# Summarize RF prediction error (mean error, mean squared error) within
# 5-degree actual-temperature bins
tmpGGActualHOU <- newPredsActualHOU %>%
mutate(act5=round(temperature_2m/5)*5, err=pred-temperature_2m, err2=err**2) %>%
group_by(act5) %>%
summarize(across(.cols=where(is.numeric), .fns=function(x) mean(x)), n=n())
tmpGGActualHOU
## # A tibble: 11 × 6
## act5 temperature_2m pred err err2 n
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 -10 -8.87 -8.81 0.0637 0.0280 28
## 2 -5 -3.97 -3.89 0.0794 0.0296 194
## 3 0 0.788 0.821 0.0325 0.0217 1416
## 4 5 5.36 5.36 0.00660 0.0139 5956
## 5 10 10.2 10.2 -0.000255 0.0138 12122
## 6 15 15.2 15.2 -0.00274 0.0180 16238
## 7 20 20.2 20.1 -0.0650 0.0557 25720
## 8 25 25.0 24.6 -0.430 0.565 36145
## 9 30 29.6 29.3 -0.249 0.180 19555
## 10 35 34.2 33.9 -0.340 0.243 5047
## 11 40 38.4 36.7 -1.61 3.25 291
tmpGGActualHOU %>%
select(act5, temperature_2m, pred) %>%
pivot_longer(cols=-c(act5)) %>%
ggplot(aes(x=act5, y=value)) +
geom_line(aes(group=name,
color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
)
) +
labs(title="Actual vs. Predicted Temperature Using Existing Model on Fourth City Data",
x="Fourth city actual temperature (rounded to nearest 5)",
y="Average temperature for metric"
) +
scale_color_discrete("Metric") +
geom_abline(slope=1, intercept=0, lty=2)
Predictions are generally accurate, as the range of temperatures and dewpoints in Houston is generally well represented in the existing NYC and Chicago data.
Functions are applied to predict month for the new city, once using the new city’s data and once using the old city’s data:
# Split new city data into train and test sets. NOTE: this is a 3:1 split in
# favor of TRAIN (75% of rows sampled for training, 25% held out for test) --
# the sampled indices feed houTempTrain, not houTempTest.
set.seed(24052114)
# seq_len() replaces 1:nrow() (same values for nrow >= 1, so the sample drawn
# under this seed is unchanged; safer if the table were ever empty)
idxTrainHOU <- sort(sample(seq_len(nrow(houTemp)), size=round(0.75*nrow(houTemp)), replace=FALSE))
houTempTrain <- houTemp[idxTrainHOU, ]
houTempTest <- houTemp[-idxTrainHOU, ]
# Using own data, object not returned
# Train a month classifier on Houston's own pre-2022 training rows and score
# it on the 2022 rows of the holdout split; only the test-set predictions
# (tstPred) are kept. keyLabel supplies the label text -- defined earlier in
# the file; confirm it matches the month task at this point.
houPredsMonth <- runFullRF(dfTrain=houTempTrain %>% filter(year(date) < 2022),
yVar="month",
xVars=varsTrain,
dfTest=houTempTest %>% filter(year(date)==2022),
useLabel=keyLabel,
useSub=stringr::str_to_sentence(keyLabel),
returnData=TRUE
)[["tstPred"]]
##
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.339%
# Using previous city data
# Apply the month model trained on the earlier cities to Houston's 2022
# holdout rows; keep only the actual month and predicted month columns.
newPredsMonthHOU <- newCityPredict(rfMonth,
dfTest=houTempTest %>% filter(year(date)==2022),
trueCol="month",
isContVar=FALSE,
useSub="Previous city percentile model applied to make new city predictions"
)[["tstPred"]][,c("month", "pred")]
##
## Accuracy of requested data is: 69.79%
Month is predicted with ~70% accuracy, with most errors being +/- 1 month, when applying the old-city model to the new city data. Accuracy decreases, perhaps surprisingly, to ~56% when the model is instead trained and evaluated on the new city's own data.